[llvm-branch-commits] [llvm] [RISCV] Add TuneDisableLatencySchedHeuristic (PR #115858)

Pengcheng Wang via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Nov 27 21:07:48 PST 2024


https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/115858

From bd7860cc7a3c51310a2eb8019a1a5e46e9c29411 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Wed, 27 Nov 2024 14:28:38 +0800
Subject: [PATCH 1/2] Add tune feature and set false by default

Created using spr 1.3.6-beta.1
---
 llvm/lib/Target/RISCV/RISCVFeatures.td        |    4 +
 llvm/lib/Target/RISCV/RISCVSubtarget.cpp      |    9 +-
 .../CodeGen/RISCV/GlobalISel/alu-roundtrip.ll |    4 +-
 .../test/CodeGen/RISCV/GlobalISel/bitmanip.ll |  136 +-
 .../RISCV/GlobalISel/constbarrier-rv32.ll     |   18 +-
 .../RISCV/GlobalISel/constbarrier-rv64.ll     |    2 +-
 llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll    |    4 +-
 .../CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll  |    2 +-
 llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll |  361 +-
 .../test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll |    4 +-
 llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll |    4 +-
 llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll |  522 +-
 .../test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll |   12 +-
 llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll  |   24 +-
 llvm/test/CodeGen/RISCV/abds-neg.ll           |  524 +-
 llvm/test/CodeGen/RISCV/abds.ll               |  508 +-
 llvm/test/CodeGen/RISCV/abdu-neg.ll           |  716 +-
 llvm/test/CodeGen/RISCV/abdu.ll               |  488 +-
 llvm/test/CodeGen/RISCV/add-before-shl.ll     |   36 +-
 llvm/test/CodeGen/RISCV/add-imm.ll            |   24 +-
 llvm/test/CodeGen/RISCV/addcarry.ll           |   22 +-
 llvm/test/CodeGen/RISCV/addimm-mulimm.ll      |  126 +-
 llvm/test/CodeGen/RISCV/alu16.ll              |    4 +-
 llvm/test/CodeGen/RISCV/alu8.ll               |    4 +-
 llvm/test/CodeGen/RISCV/and.ll                |    2 +-
 .../RISCV/atomic-cmpxchg-branch-on-result.ll  |   32 +-
 llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll     |  384 +-
 llvm/test/CodeGen/RISCV/atomic-rmw.ll         | 2686 ++---
 llvm/test/CodeGen/RISCV/atomic-signext.ll     |  244 +-
 .../CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll |   72 +-
 .../CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll |   24 +-
 llvm/test/CodeGen/RISCV/avgceils.ll           |   32 +-
 llvm/test/CodeGen/RISCV/avgceilu.ll           |   20 +-
 llvm/test/CodeGen/RISCV/avgfloors.ll          |   20 +-
 llvm/test/CodeGen/RISCV/avgflooru.ll          |   40 +-
 llvm/test/CodeGen/RISCV/bf16-promote.ll       |    4 +-
 llvm/test/CodeGen/RISCV/bfloat-arith.ll       |  114 +-
 llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll     |   16 +-
 llvm/test/CodeGen/RISCV/bfloat-convert.ll     |  182 +-
 llvm/test/CodeGen/RISCV/bfloat-fcmp.ll        |    8 +-
 llvm/test/CodeGen/RISCV/bfloat-mem.ll         |    4 +-
 llvm/test/CodeGen/RISCV/bfloat.ll             |   24 +-
 llvm/test/CodeGen/RISCV/bitextract-mac.ll     |   12 +-
 llvm/test/CodeGen/RISCV/bswap-bitreverse.ll   | 1052 +-
 llvm/test/CodeGen/RISCV/calling-conv-half.ll  |   32 +-
 .../RISCV/calling-conv-ilp32-ilp32f-common.ll |  100 +-
 ...calling-conv-ilp32-ilp32f-ilp32d-common.ll |  252 +-
 .../test/CodeGen/RISCV/calling-conv-ilp32d.ll |   68 +-
 .../test/CodeGen/RISCV/calling-conv-ilp32e.ll |  616 +-
 .../calling-conv-ilp32f-ilp32d-common.ll      |   48 +-
 .../calling-conv-lp64-lp64f-lp64d-common.ll   |  116 +-
 llvm/test/CodeGen/RISCV/cmov-branch-opt.ll    |    8 +-
 llvm/test/CodeGen/RISCV/compress.ll           |    2 +-
 llvm/test/CodeGen/RISCV/condbinops.ll         |   80 +-
 llvm/test/CodeGen/RISCV/condops.ll            |  874 +-
 llvm/test/CodeGen/RISCV/copysign-casts.ll     |   96 +-
 llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll    | 1166 +--
 .../CodeGen/RISCV/ctz_zero_return_test.ll     |   62 +-
 llvm/test/CodeGen/RISCV/div-by-constant.ll    |   62 +-
 llvm/test/CodeGen/RISCV/div-pow2.ll           |   64 +-
 llvm/test/CodeGen/RISCV/div.ll                |   56 +-
 llvm/test/CodeGen/RISCV/double-arith.ll       |    8 +-
 .../RISCV/double-bitmanip-dagcombines.ll      |    2 +-
 .../test/CodeGen/RISCV/double-calling-conv.ll |   64 +-
 llvm/test/CodeGen/RISCV/double-convert.ll     |   86 +-
 llvm/test/CodeGen/RISCV/double-imm.ll         |    2 +-
 llvm/test/CodeGen/RISCV/double-intrinsics.ll  |    6 +-
 .../CodeGen/RISCV/double-previous-failure.ll  |    2 +-
 .../CodeGen/RISCV/double-round-conv-sat.ll    |  432 +-
 llvm/test/CodeGen/RISCV/double_reduct.ll      |   30 +-
 .../early-clobber-tied-def-subreg-liveness.ll |    4 +-
 llvm/test/CodeGen/RISCV/float-arith.ll        |    8 +-
 .../RISCV/float-bitmanip-dagcombines.ll       |    4 +-
 llvm/test/CodeGen/RISCV/float-convert.ll      |  104 +-
 llvm/test/CodeGen/RISCV/float-intrinsics.ll   |   98 +-
 .../CodeGen/RISCV/float-round-conv-sat.ll     |  288 +-
 .../test/CodeGen/RISCV/fold-addi-loadstore.ll |   24 +-
 .../CodeGen/RISCV/fold-binop-into-select.ll   |    2 +-
 llvm/test/CodeGen/RISCV/forced-atomics.ll     |   54 +-
 llvm/test/CodeGen/RISCV/fp128.ll              |   28 +-
 llvm/test/CodeGen/RISCV/fpclamptosat.ll       |   38 +-
 llvm/test/CodeGen/RISCV/fpenv.ll              |    6 +-
 llvm/test/CodeGen/RISCV/ghccc-rv32.ll         |   80 +-
 llvm/test/CodeGen/RISCV/ghccc-rv64.ll         |   80 +-
 .../test/CodeGen/RISCV/ghccc-without-f-reg.ll |   40 +-
 llvm/test/CodeGen/RISCV/global-merge.ll       |    4 +-
 llvm/test/CodeGen/RISCV/half-arith-strict.ll  |   92 +-
 llvm/test/CodeGen/RISCV/half-arith.ll         |  182 +-
 .../RISCV/half-bitmanip-dagcombines.ll        |   12 +-
 llvm/test/CodeGen/RISCV/half-br-fcmp.ll       |   24 +-
 llvm/test/CodeGen/RISCV/half-convert.ll       |  938 +-
 llvm/test/CodeGen/RISCV/half-fcmp-strict.ll   |    8 +-
 llvm/test/CodeGen/RISCV/half-fcmp.ll          |   16 +-
 llvm/test/CodeGen/RISCV/half-intrinsics.ll    |   20 +-
 llvm/test/CodeGen/RISCV/half-mem.ll           |   16 +-
 .../test/CodeGen/RISCV/half-round-conv-sat.ll |  588 +-
 llvm/test/CodeGen/RISCV/half-select-fcmp.ll   |    4 +-
 llvm/test/CodeGen/RISCV/iabs.ll               |  104 +-
 llvm/test/CodeGen/RISCV/imm.ll                |   22 +-
 .../RISCV/inline-asm-d-constraint-f.ll        |    6 +-
 .../CodeGen/RISCV/inline-asm-d-modifier-N.ll  |    6 +-
 .../CodeGen/RISCV/interrupt-attr-nocall.ll    |   24 +-
 .../RISCV/intrinsic-cttz-elts-vscale.ll       |   58 +-
 .../RISCV/lack-of-signed-truncation-check.ll  |   12 +-
 .../RISCV/loop-strength-reduce-loop-invar.ll  |   44 +-
 llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll    |    2 +-
 .../RISCV/machinelicm-constant-phys-reg.ll    |    2 +-
 llvm/test/CodeGen/RISCV/memcmp-optsize.ll     |  262 +-
 llvm/test/CodeGen/RISCV/memcmp.ll             |  582 +-
 llvm/test/CodeGen/RISCV/memcpy.ll             |  146 +-
 llvm/test/CodeGen/RISCV/mul.ll                |  531 +-
 llvm/test/CodeGen/RISCV/neg-abs.ll            |    8 +-
 llvm/test/CodeGen/RISCV/or-is-add.ll          |    2 +-
 .../test/CodeGen/RISCV/overflow-intrinsics.ll |   36 +-
 llvm/test/CodeGen/RISCV/pr51206.ll            |   24 +-
 llvm/test/CodeGen/RISCV/pr56457.ll            |   50 +-
 llvm/test/CodeGen/RISCV/pr58511.ll            |   10 +-
 llvm/test/CodeGen/RISCV/pr65025.ll            |    4 +-
 llvm/test/CodeGen/RISCV/pr68855.ll            |    4 +-
 llvm/test/CodeGen/RISCV/pr69586.ll            | 2654 ++---
 llvm/test/CodeGen/RISCV/pr84653_pr85190.ll    |   20 +-
 llvm/test/CodeGen/RISCV/pr95271.ll            |   50 +-
 ...regalloc-last-chance-recoloring-failure.ll |    8 +-
 llvm/test/CodeGen/RISCV/rem.ll                |   18 +-
 .../CodeGen/RISCV/riscv-codegenprepare-asm.ll |    2 +-
 .../CodeGen/RISCV/riscv-shifted-extend.ll     |   26 +-
 llvm/test/CodeGen/RISCV/rotl-rotr.ll          |  438 +-
 llvm/test/CodeGen/RISCV/rv32xtheadbb.ll       |   75 +-
 llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll       |   55 +-
 llvm/test/CodeGen/RISCV/rv32zbb.ll            |  449 +-
 llvm/test/CodeGen/RISCV/rv32zbs.ll            |   44 +-
 .../test/CodeGen/RISCV/rv64-double-convert.ll |   38 +-
 llvm/test/CodeGen/RISCV/rv64-float-convert.ll |   30 +-
 llvm/test/CodeGen/RISCV/rv64-half-convert.ll  |   38 +-
 llvm/test/CodeGen/RISCV/rv64-trampoline.ll    |   32 +-
 llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll   |    6 +-
 .../RISCV/rv64i-w-insts-legalization.ll       |   16 +-
 llvm/test/CodeGen/RISCV/rv64xtheadbb.ll       |  300 +-
 llvm/test/CodeGen/RISCV/rv64zba.ll            |    4 +-
 llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll  |    8 +-
 llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll       |   16 +-
 llvm/test/CodeGen/RISCV/rv64zbb.ll            |  518 +-
 llvm/test/CodeGen/RISCV/rv64zbkb.ll           |    2 +-
 .../RISCV/rvv/65704-illegal-instruction.ll    |    4 +-
 llvm/test/CodeGen/RISCV/rvv/abs-vp.ll         |   10 +-
 .../CodeGen/RISCV/rvv/active_lane_mask.ll     |  109 +-
 .../rvv/alloca-load-store-scalable-array.ll   |   20 +-
 .../rvv/alloca-load-store-scalable-struct.ll  |    2 +-
 .../rvv/alloca-load-store-vector-tuple.ll     |    6 +-
 .../CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll  |    6 +-
 .../CodeGen/RISCV/rvv/bitreverse-sdnode.ll    |  969 +-
 llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll  | 2415 ++---
 llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll   |  347 +-
 llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll       | 1082 +-
 .../CodeGen/RISCV/rvv/calling-conv-fastcc.ll  |  318 +-
 llvm/test/CodeGen/RISCV/rvv/calling-conv.ll   |   12 +-
 llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll        |  351 +-
 llvm/test/CodeGen/RISCV/rvv/compressstore.ll  |  172 +-
 .../RISCV/rvv/constant-folding-crash.ll       |   34 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll    | 1074 +-
 llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll        |  274 +-
 llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll   |  350 +-
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       | 1207 ++-
 llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll    | 1404 +--
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        | 1605 +--
 .../RISCV/rvv/dont-sink-splat-operands.ll     |   24 +-
 llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll    |   32 +-
 llvm/test/CodeGen/RISCV/rvv/expandload.ll     | 4325 ++++----
 .../CodeGen/RISCV/rvv/extract-subvector.ll    |    4 +-
 llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll  |   20 +-
 llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll  |   40 +-
 .../CodeGen/RISCV/rvv/extractelt-int-rv32.ll  |   14 +-
 .../CodeGen/RISCV/rvv/extractelt-int-rv64.ll  |    8 +-
 .../RISCV/rvv/fceil-constrained-sdnode.ll     |   20 +-
 llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll   |   60 +-
 .../RISCV/rvv/ffloor-constrained-sdnode.ll    |   20 +-
 llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll  |   60 +-
 .../rvv/fixed-vector-i8-index-cornercase.ll   |  108 +-
 .../RISCV/rvv/fixed-vectors-binop-splats.ll   |   24 +-
 .../RISCV/rvv/fixed-vectors-bitreverse-vp.ll  | 2347 ++---
 .../RISCV/rvv/fixed-vectors-bitreverse.ll     |  420 +-
 .../RISCV/rvv/fixed-vectors-bswap-vp.ll       |  980 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-bswap.ll  |  172 +-
 .../rvv/fixed-vectors-buildvec-of-binop.ll    |   73 +-
 .../rvv/fixed-vectors-calling-conv-fastcc.ll  |   56 +-
 .../RISCV/rvv/fixed-vectors-calling-conv.ll   |   46 +-
 .../RISCV/rvv/fixed-vectors-ceil-vp.ll        |  135 +-
 .../RISCV/rvv/fixed-vectors-ctlz-vp.ll        | 3172 +++---
 .../CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll   |  532 +-
 .../RISCV/rvv/fixed-vectors-ctpop-vp.ll       | 1288 +--
 .../CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll  |  172 +-
 .../RISCV/rvv/fixed-vectors-cttz-vp.ll        | 3080 +++---
 .../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll   |  556 +-
 .../rvv/fixed-vectors-deinterleave-load.ll    |   41 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-elen.ll   |   16 +-
 .../RISCV/rvv/fixed-vectors-extract-i1.ll     |   96 +-
 .../rvv/fixed-vectors-extract-subvector.ll    |    4 +-
 .../RISCV/rvv/fixed-vectors-extract.ll        |  124 +-
 .../fixed-vectors-fceil-constrained-sdnode.ll |   24 +-
 ...fixed-vectors-ffloor-constrained-sdnode.ll |   24 +-
 .../RISCV/rvv/fixed-vectors-floor-vp.ll       |  135 +-
 .../RISCV/rvv/fixed-vectors-fmaximum-vp.ll    |   16 +-
 .../RISCV/rvv/fixed-vectors-fmaximum.ll       |   22 +-
 .../RISCV/rvv/fixed-vectors-fminimum-vp.ll    |   16 +-
 .../RISCV/rvv/fixed-vectors-fminimum.ll       |   22 +-
 ...d-vectors-fnearbyint-constrained-sdnode.ll |   20 +-
 .../RISCV/rvv/fixed-vectors-fp-buildvec.ll    |  550 +-
 .../RISCV/rvv/fixed-vectors-fp-interleave.ll  |   33 +-
 .../RISCV/rvv/fixed-vectors-fp-setcc.ll       |   48 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll     |  188 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll   |  168 +-
 .../RISCV/rvv/fixed-vectors-fptrunc-vp.ll     |    4 +-
 ...fixed-vectors-fround-constrained-sdnode.ll |   24 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-fround.ll |   14 +-
 ...d-vectors-froundeven-constrained-sdnode.ll |   24 +-
 .../RISCV/rvv/fixed-vectors-froundeven.ll     |   14 +-
 ...fixed-vectors-ftrunc-constrained-sdnode.ll |   24 +-
 .../RISCV/rvv/fixed-vectors-insert-i1.ll      |    6 +-
 .../rvv/fixed-vectors-insert-subvector.ll     |   32 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-insert.ll |   30 +-
 .../RISCV/rvv/fixed-vectors-int-buildvec.ll   | 1557 ++-
 .../rvv/fixed-vectors-int-explodevector.ll    |  678 +-
 .../RISCV/rvv/fixed-vectors-int-interleave.ll |   45 +-
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll   |   49 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-int.ll    |  428 +-
 ...fixed-vectors-interleaved-access-zve32x.ll |   46 +-
 .../rvv/fixed-vectors-interleaved-access.ll   |  619 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-llrint.ll |  225 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-load.ll   |   16 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-lrint.ll  |  665 +-
 .../RISCV/rvv/fixed-vectors-mask-buildvec.ll  |   36 +-
 .../RISCV/rvv/fixed-vectors-masked-gather.ll  |  988 +-
 .../RISCV/rvv/fixed-vectors-masked-load-fp.ll |   18 +-
 .../rvv/fixed-vectors-masked-load-int.ll      |   12 +-
 .../RISCV/rvv/fixed-vectors-masked-scatter.ll |  416 +-
 .../rvv/fixed-vectors-masked-store-fp.ll      |   18 +-
 .../rvv/fixed-vectors-masked-store-int.ll     |   16 +-
 .../RISCV/rvv/fixed-vectors-nearbyint-vp.ll   |   51 +-
 .../rvv/fixed-vectors-reduction-formation.ll  |   14 +-
 .../RISCV/rvv/fixed-vectors-reduction-fp.ll   |  340 +-
 .../rvv/fixed-vectors-reduction-int-vp.ll     |   42 +-
 .../RISCV/rvv/fixed-vectors-reduction-int.ll  |  352 +-
 .../RISCV/rvv/fixed-vectors-rint-vp.ll        |   21 +-
 .../RISCV/rvv/fixed-vectors-round-vp.ll       |  135 +-
 .../RISCV/rvv/fixed-vectors-roundeven-vp.ll   |  135 +-
 .../RISCV/rvv/fixed-vectors-roundtozero-vp.ll |  135 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-sad.ll    |   38 +-
 .../RISCV/rvv/fixed-vectors-scalarized.ll     |   64 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    | 3250 +++---
 .../RISCV/rvv/fixed-vectors-setcc-int-vp.ll   |    8 +-
 .../fixed-vectors-shuffle-changes-length.ll   |  153 +-
 .../RISCV/rvv/fixed-vectors-shuffle-concat.ll |   33 +-
 .../rvv/fixed-vectors-shuffle-deinterleave.ll |   36 +-
 .../rvv/fixed-vectors-shuffle-exact-vlen.ll   |    2 +-
 .../rvv/fixed-vectors-shuffle-reverse.ll      |  527 +-
 .../RISCV/rvv/fixed-vectors-shuffle-rotate.ll |  108 +-
 .../rvv/fixed-vectors-shufflevector-vnsrl.ll  |   31 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-store.ll  |   14 +-
 .../rvv/fixed-vectors-strided-load-combine.ll |   36 +-
 .../fixed-vectors-strided-load-store-asm.ll   |  128 +-
 .../RISCV/rvv/fixed-vectors-strided-vpload.ll |   20 +-
 .../RISCV/rvv/fixed-vectors-trunc-vp.ll       |   18 +-
 .../RISCV/rvv/fixed-vectors-unaligned.ll      |   22 +-
 .../fixed-vectors-vfcmp-constrained-sdnode.ll |  432 +-
 ...fixed-vectors-vfcmps-constrained-sdnode.ll |   66 +-
 .../RISCV/rvv/fixed-vectors-vfma-vp.ll        |  163 +-
 .../RISCV/rvv/fixed-vectors-vfmuladd-vp.ll    |   75 +-
 .../fixed-vectors-vfw-web-simplification.ll   |    4 +-
 .../RISCV/rvv/fixed-vectors-vpgather.ll       |   86 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vpload.ll |   14 +-
 .../RISCV/rvv/fixed-vectors-vpscatter.ll      |   98 +-
 .../RISCV/rvv/fixed-vectors-vpstore.ll        |    4 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vrol.ll   |  166 +-
 .../CodeGen/RISCV/rvv/fixed-vectors-vror.ll   |  250 +-
 .../RISCV/rvv/fixed-vectors-vsadd-vp.ll       |    6 +-
 .../RISCV/rvv/fixed-vectors-vselect-vp.ll     |   36 +-
 .../RISCV/rvv/fixed-vectors-vselect.ll        |  168 +-
 .../RISCV/rvv/fixed-vectors-vssub-vp.ll       |   14 +-
 .../RISCV/rvv/fixed-vectors-vssubu-vp.ll      |    8 +-
 llvm/test/CodeGen/RISCV/rvv/floor-vp.ll       |  351 +-
 .../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll |  281 +-
 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll    |  555 +-
 .../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll |  281 +-
 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll    |  555 +-
 .../rvv/fnearbyint-constrained-sdnode.ll      |   20 +-
 .../CodeGen/RISCV/rvv/fnearbyint-sdnode.ll    |   60 +-
 .../CodeGen/RISCV/rvv/fold-binary-reduce.ll   |    6 +-
 .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll     |  308 +-
 llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll   |   24 +-
 llvm/test/CodeGen/RISCV/rvv/frm-insert.ll     |   24 +-
 .../RISCV/rvv/fround-constrained-sdnode.ll    |   20 +-
 llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll  |   60 +-
 .../rvv/froundeven-constrained-sdnode.ll      |   20 +-
 .../CodeGen/RISCV/rvv/froundeven-sdnode.ll    |   60 +-
 llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll   |   90 +-
 llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll      |   20 +-
 .../RISCV/rvv/ftrunc-constrained-sdnode.ll    |   20 +-
 llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll  |   24 +-
 .../CodeGen/RISCV/rvv/insert-subvector.ll     |    8 +-
 llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll   |    8 +-
 .../CodeGen/RISCV/rvv/interleave-crash.ll     |    8 +-
 llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll      |   10 +-
 llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll       |   10 +-
 llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll  |   12 +-
 llvm/test/CodeGen/RISCV/rvv/memset-inline.ll  |   32 +-
 llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll |   30 +-
 .../CodeGen/RISCV/rvv/mscatter-combine.ll     |   20 +-
 .../test/CodeGen/RISCV/rvv/mscatter-sdnode.ll |   41 +-
 .../RISCV/rvv/mutate-prior-vsetvli-avl.ll     |   10 +-
 .../RISCV/rvv/named-vector-shuffle-reverse.ll |  594 +-
 llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll   |  322 +-
 llvm/test/CodeGen/RISCV/rvv/pr104480.ll       |   22 +-
 llvm/test/CodeGen/RISCV/rvv/pr52475.ll        |    2 +-
 llvm/test/CodeGen/RISCV/rvv/pr61561.ll        |   13 +-
 llvm/test/CodeGen/RISCV/rvv/pr88576.ll        |    4 +-
 llvm/test/CodeGen/RISCV/rvv/pr95865.ll        |   44 +-
 llvm/test/CodeGen/RISCV/rvv/rint-vp.ll        |  326 +-
 .../RISCV/rvv/riscv-codegenprepare-asm.ll     |    6 +-
 llvm/test/CodeGen/RISCV/rvv/round-vp.ll       |  351 +-
 llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll   |  351 +-
 llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll |  351 +-
 .../test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll |    2 +-
 .../test/CodeGen/RISCV/rvv/rvv-framelayout.ll |    4 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll    |  540 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll       |  168 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll   |   32 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll  |    4 +-
 .../CodeGen/RISCV/rvv/sink-splat-operands.ll  |  136 +-
 .../rvv/splat-vector-split-i64-vl-sdnode.ll   |   98 +-
 llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll   |   12 +-
 llvm/test/CodeGen/RISCV/rvv/stepvector.ll     |   40 +-
 llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll |    4 +-
 .../test/CodeGen/RISCV/rvv/strided-vpstore.ll |   28 +-
 .../RISCV/rvv/undef-earlyclobber-chain.ll     |   26 +-
 llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll |   16 +-
 llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll        |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll    |   16 +-
 llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll        |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll       |    3 +-
 .../CodeGen/RISCV/rvv/vec3-setcc-crash.ll     |   40 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |   84 +-
 .../RISCV/rvv/vector-deinterleave-load.ll     |   85 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |   46 +-
 .../RISCV/rvv/vector-interleave-fixed.ll      |    8 +-
 .../RISCV/rvv/vector-interleave-store.ll      |   64 +-
 .../CodeGen/RISCV/rvv/vector-interleave.ll    |  212 +-
 llvm/test/CodeGen/RISCV/rvv/vector-splice.ll  |   32 +-
 llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll       |   10 +-
 .../RISCV/rvv/vfadd-constrained-sdnode.ll     |  116 +-
 llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll   |  112 +-
 llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll       |  266 +-
 .../RISCV/rvv/vfcmp-constrained-sdnode.ll     |  312 +-
 .../RISCV/rvv/vfcmps-constrained-sdnode.ll    |   84 +-
 .../CodeGen/RISCV/rvv/vfcopysign-sdnode.ll    |  184 +-
 .../RISCV/rvv/vfdiv-constrained-sdnode.ll     |  116 +-
 llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll   |  112 +-
 llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll       |  266 +-
 .../RISCV/rvv/vfirst-byte-compare-index.ll    |   10 +-
 llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll        | 5542 +++++-----
 .../RISCV/rvv/vfmadd-constrained-sdnode.ll    |  224 +-
 llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll  |  395 +-
 llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll   |  112 +-
 llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll       |   88 +-
 llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll   |  112 +-
 llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll       |   88 +-
 .../RISCV/rvv/vfmsub-constrained-sdnode.ll    |  152 +-
 .../RISCV/rvv/vfmul-constrained-sdnode.ll     |  116 +-
 llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll   |  112 +-
 llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll       |  133 +-
 llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll    |  104 +-
 llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll       |   10 +-
 .../RISCV/rvv/vfnmadd-constrained-sdnode.ll   |  153 +-
 .../RISCV/rvv/vfnmsub-constrained-sdnode.ll   |  219 +-
 llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll      |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll  |  120 +-
 llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll     |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll     |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll    |   26 +-
 .../RISCV/rvv/vfsqrt-constrained-sdnode.ll    |   16 +-
 llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll  |   16 +-
 llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll      |   66 +-
 .../RISCV/rvv/vfsub-constrained-sdnode.ll     |  116 +-
 llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll   |  112 +-
 llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll       |  266 +-
 llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll  |   68 +-
 llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll        |   10 +-
 llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll       |   11 +-
 llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll        |   10 +-
 llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll       |   11 +-
 llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll    |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll   |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll        |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll |   48 +-
 .../rvv/vp-reverse-mask-fixed-vectors.ll      |   24 +-
 .../test/CodeGen/RISCV/rvv/vp-reverse-mask.ll |   90 +-
 .../test/CodeGen/RISCV/rvv/vpgather-sdnode.ll |   44 +-
 llvm/test/CodeGen/RISCV/rvv/vpload.ll         |   14 +-
 llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll |   14 +-
 .../CodeGen/RISCV/rvv/vpscatter-sdnode.ll     |   32 +-
 llvm/test/CodeGen/RISCV/rvv/vpstore.ll        |   28 +-
 .../RISCV/rvv/vreductions-fp-sdnode.ll        |   10 +-
 .../CodeGen/RISCV/rvv/vreductions-fp-vp.ll    |   12 +-
 .../CodeGen/RISCV/rvv/vreductions-int-vp.ll   |   62 +-
 .../test/CodeGen/RISCV/rvv/vreductions-int.ll |   48 +-
 llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll    |   16 +-
 llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll        |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll   |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll       |    3 +-
 llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll    |  178 +-
 llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll    |  178 +-
 llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll      |    4 +-
 .../CodeGen/RISCV/rvv/vscale-power-of-two.ll  |    2 +-
 .../RISCV/rvv/vscale-vw-web-simplification.ll |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll     |   42 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll     |  132 +-
 .../RISCV/rvv/vsetvli-insert-crossbb.ll       |    8 +-
 llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll     |   32 +-
 llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll        |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll        |    3 +-
 llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll       |    2 +-
 llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll      |    4 +-
 llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll      |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll     |   32 +-
 llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll       |    2 +-
 llvm/test/CodeGen/RISCV/sadd_sat.ll           |    8 +-
 llvm/test/CodeGen/RISCV/sadd_sat_plus.ll      |   68 +-
 .../CodeGen/RISCV/select-binop-identity.ll    |    2 +-
 llvm/test/CodeGen/RISCV/select-const.ll       |    8 +-
 llvm/test/CodeGen/RISCV/select.ll             |   16 +-
 llvm/test/CodeGen/RISCV/setcc-logic.ll        |    4 +-
 llvm/test/CodeGen/RISCV/sextw-removal.ll      |   84 +-
 llvm/test/CodeGen/RISCV/shift-amount-mod.ll   |   24 +-
 llvm/test/CodeGen/RISCV/shift-and.ll          |    4 +-
 llvm/test/CodeGen/RISCV/shifts.ll             |  172 +-
 llvm/test/CodeGen/RISCV/shl-cttz.ll           |   72 +-
 llvm/test/CodeGen/RISCV/shlimm-addimm.ll      |   36 +-
 .../CodeGen/RISCV/signed-truncation-check.ll  |   12 +-
 llvm/test/CodeGen/RISCV/split-offsets.ll      |   44 +-
 .../CodeGen/RISCV/split-udiv-by-constant.ll   |  566 +-
 .../CodeGen/RISCV/split-urem-by-constant.ll   |  140 +-
 .../CodeGen/RISCV/srem-seteq-illegal-types.ll |  380 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll    |  730 +-
 llvm/test/CodeGen/RISCV/ssub_sat_plus.ll      |   60 +-
 llvm/test/CodeGen/RISCV/stack-store-check.ll  |    8 +-
 llvm/test/CodeGen/RISCV/tail-calls.ll         |    6 +-
 llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll      |    2 +-
 llvm/test/CodeGen/RISCV/uadd_sat_plus.ll      |   40 +-
 .../RISCV/umulo-128-legalisation-lowering.ll  |  168 +-
 .../CodeGen/RISCV/unaligned-load-store.ll     |  159 +-
 ...unfold-masked-merge-scalar-variablemask.ll |    6 +-
 llvm/test/CodeGen/RISCV/unroll-loop-cse.ll    |   10 +-
 llvm/test/CodeGen/RISCV/urem-lkk.ll           |    4 +-
 .../CodeGen/RISCV/urem-seteq-illegal-types.ll |  252 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    |  420 +-
 llvm/test/CodeGen/RISCV/usub_sat_plus.ll      |    4 +-
 llvm/test/CodeGen/RISCV/vararg-ilp32e.ll      |    8 +-
 llvm/test/CodeGen/RISCV/vararg.ll             |  718 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 8948 +++++++++--------
 .../RISCV/wide-scalar-shift-legalization.ll   | 3884 +++----
 llvm/test/CodeGen/RISCV/xaluo.ll              | 1792 ++--
 llvm/test/CodeGen/RISCV/xtheadmac.ll          |   16 +-
 llvm/test/CodeGen/RISCV/xtheadmemidx.ll       |    4 +-
 466 files changed, 53106 insertions(+), 50852 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 6ec7f9ab78c0d4..509cdfc7d0d015 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1390,6 +1390,10 @@ def FeaturePredictableSelectIsExpensive
     : SubtargetFeature<"predictable-select-expensive", "PredictableSelectIsExpensive", "true",
                        "Prefer likely predicted branches over selects">;
 
+def FeatureDisableLatencySchedHeuristic
+    : SubtargetFeature<"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+                       "Disable latency scheduling heuristic">;
+
 def TuneOptimizedZeroStrideLoad
    : SubtargetFeature<"optimized-zero-stride-load", "HasOptimizedZeroStrideLoad",
                       "true", "Optimized (perform fewer memory operations)"
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index ac81d8980fd3e0..7c890ce14f75c8 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -208,12 +208,9 @@ void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
   Policy.OnlyTopDown = false;
   Policy.OnlyBottomUp = false;
 
-  // Enabling or Disabling the latency heuristic is a close call: It seems to
-  // help nearly no benchmark on out-of-order architectures, on the other hand
-  // it regresses register pressure on a few benchmarking.
-  // FIXME: This is from AArch64, but we haven't evaluated it on RISC-V.
-  // TODO: We may disable it for out-of-order architectures only.
-  Policy.DisableLatencyHeuristic = true;
+  // Disabling the latency heuristic can reduce the number of spills/reloads,
+  // but it may cause regressions on some cores.
+  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
 
   // Spilling is generally expensive on all RISC-V cores, so always enable
   // register-pressure tracking. This will increase compile time.
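
[Reading aid, not part of the patch: a heavily simplified model of the tie-breaking that the policy bit affects may make the comment above concrete. This is an assumed sketch with illustrative names, not the real GenericScheduler::tryCandidate logic.]

    // Assumed, heavily simplified model; not the actual MachineScheduler code.
    struct PolicySketch {
      bool DisableLatencyHeuristic = false;
    };

    struct CandSketch {
      int PressureDelta; // register-pressure increase if this node is picked
      int LatencyStall;  // stall cycles exposed if this node is delayed
    };

    // Returns true if A should be preferred over B.
    bool preferCandidate(const PolicySketch &Policy, const CandSketch &A,
                         const CandSketch &B) {
      // With DisableLatencyHeuristic set, latency stops influencing the choice,
      // so low-pressure candidates win more often and fewer spills/reloads are
      // expected. With it clear (the default here, since the tune feature is
      // off by default per the commit subject), latency is weighed as well,
      // which can lengthen live ranges but hide stalls on some cores.
      if (!Policy.DisableLatencyHeuristic && A.LatencyStall != B.LatencyStall)
        return A.LatencyStall < B.LatencyStall;
      return A.PressureDelta <= B.PressureDelta;
    }

Which default a given core wants then becomes a per-CPU tuning question, which is what the new tune feature is meant to express.
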
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
index 330f8b16065f13..ee414992a5245c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
@@ -25,8 +25,8 @@ define i32 @add_i8_signext_i32(i8 %a, i8 %b) {
 ; RV32IM-LABEL: add_i8_signext_i32:
 ; RV32IM:       # %bb.0: # %entry
 ; RV32IM-NEXT:    slli a0, a0, 24
-; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    slli a1, a1, 24
+; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    add a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -34,8 +34,8 @@ define i32 @add_i8_signext_i32(i8 %a, i8 %b) {
 ; RV64IM-LABEL: add_i8_signext_i32:
 ; RV64IM:       # %bb.0: # %entry
 ; RV64IM-NEXT:    slli a0, a0, 56
-; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    slli a1, a1, 56
+; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    add a0, a0, a1
 ; RV64IM-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
index f33ba1d7a302ef..bce6dfacf8e82c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
@@ -6,8 +6,8 @@ define i2 @bitreverse_i2(i2 %x) {
 ; RV32-LABEL: bitreverse_i2:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 1
-; RV32-NEXT:    andi a1, a1, 2
 ; RV32-NEXT:    andi a0, a0, 3
+; RV32-NEXT:    andi a1, a1, 2
 ; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    ret
@@ -15,8 +15,8 @@ define i2 @bitreverse_i2(i2 %x) {
 ; RV64-LABEL: bitreverse_i2:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 1
-; RV64-NEXT:    andi a1, a1, 2
 ; RV64-NEXT:    andi a0, a0, 3
+; RV64-NEXT:    andi a1, a1, 2
 ; RV64-NEXT:    srli a0, a0, 1
 ; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    ret
@@ -28,8 +28,8 @@ define i3 @bitreverse_i3(i3 %x) {
 ; RV32-LABEL: bitreverse_i3:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 2
-; RV32-NEXT:    andi a1, a1, 4
 ; RV32-NEXT:    andi a0, a0, 7
+; RV32-NEXT:    andi a1, a1, 4
 ; RV32-NEXT:    andi a2, a0, 2
 ; RV32-NEXT:    or a1, a1, a2
 ; RV32-NEXT:    srli a0, a0, 2
@@ -39,8 +39,8 @@ define i3 @bitreverse_i3(i3 %x) {
 ; RV64-LABEL: bitreverse_i3:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 2
-; RV64-NEXT:    andi a1, a1, 4
 ; RV64-NEXT:    andi a0, a0, 7
+; RV64-NEXT:    andi a1, a1, 4
 ; RV64-NEXT:    andi a2, a0, 2
 ; RV64-NEXT:    or a1, a1, a2
 ; RV64-NEXT:    srli a0, a0, 2
@@ -54,11 +54,11 @@ define i4 @bitreverse_i4(i4 %x) {
 ; RV32-LABEL: bitreverse_i4:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 3
-; RV32-NEXT:    andi a1, a1, 8
 ; RV32-NEXT:    slli a2, a0, 1
+; RV32-NEXT:    andi a0, a0, 15
+; RV32-NEXT:    andi a1, a1, 8
 ; RV32-NEXT:    andi a2, a2, 4
 ; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    andi a0, a0, 15
 ; RV32-NEXT:    srli a2, a0, 1
 ; RV32-NEXT:    andi a2, a2, 2
 ; RV32-NEXT:    or a1, a1, a2
@@ -69,11 +69,11 @@ define i4 @bitreverse_i4(i4 %x) {
 ; RV64-LABEL: bitreverse_i4:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 3
-; RV64-NEXT:    andi a1, a1, 8
 ; RV64-NEXT:    slli a2, a0, 1
+; RV64-NEXT:    andi a0, a0, 15
+; RV64-NEXT:    andi a1, a1, 8
 ; RV64-NEXT:    andi a2, a2, 4
 ; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    andi a0, a0, 15
 ; RV64-NEXT:    srli a2, a0, 1
 ; RV64-NEXT:    andi a2, a2, 2
 ; RV64-NEXT:    or a1, a1, a2
@@ -88,21 +88,21 @@ define i7 @bitreverse_i7(i7 %x) {
 ; RV32-LABEL: bitreverse_i7:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 6
-; RV32-NEXT:    andi a1, a1, 64
 ; RV32-NEXT:    slli a2, a0, 4
+; RV32-NEXT:    slli a3, a0, 2
+; RV32-NEXT:    andi a0, a0, 127
+; RV32-NEXT:    andi a1, a1, 64
 ; RV32-NEXT:    andi a2, a2, 32
+; RV32-NEXT:    andi a3, a3, 16
 ; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    slli a2, a0, 2
-; RV32-NEXT:    andi a2, a2, 16
-; RV32-NEXT:    andi a0, a0, 127
-; RV32-NEXT:    andi a3, a0, 8
-; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    andi a2, a0, 8
+; RV32-NEXT:    or a2, a3, a2
+; RV32-NEXT:    srli a3, a0, 2
 ; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    srli a2, a0, 2
-; RV32-NEXT:    andi a2, a2, 4
-; RV32-NEXT:    srli a3, a0, 4
-; RV32-NEXT:    andi a3, a3, 2
-; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    srli a2, a0, 4
+; RV32-NEXT:    andi a3, a3, 4
+; RV32-NEXT:    andi a2, a2, 2
+; RV32-NEXT:    or a2, a3, a2
 ; RV32-NEXT:    or a1, a1, a2
 ; RV32-NEXT:    srli a0, a0, 6
 ; RV32-NEXT:    or a0, a1, a0
@@ -111,21 +111,21 @@ define i7 @bitreverse_i7(i7 %x) {
 ; RV64-LABEL: bitreverse_i7:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 6
-; RV64-NEXT:    andi a1, a1, 64
 ; RV64-NEXT:    slli a2, a0, 4
+; RV64-NEXT:    slli a3, a0, 2
+; RV64-NEXT:    andi a0, a0, 127
+; RV64-NEXT:    andi a1, a1, 64
 ; RV64-NEXT:    andi a2, a2, 32
+; RV64-NEXT:    andi a3, a3, 16
 ; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    slli a2, a0, 2
-; RV64-NEXT:    andi a2, a2, 16
-; RV64-NEXT:    andi a0, a0, 127
-; RV64-NEXT:    andi a3, a0, 8
-; RV64-NEXT:    or a2, a2, a3
+; RV64-NEXT:    andi a2, a0, 8
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    srli a3, a0, 2
 ; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    srli a2, a0, 2
-; RV64-NEXT:    andi a2, a2, 4
-; RV64-NEXT:    srli a3, a0, 4
-; RV64-NEXT:    andi a3, a3, 2
-; RV64-NEXT:    or a2, a2, a3
+; RV64-NEXT:    srli a2, a0, 4
+; RV64-NEXT:    andi a3, a3, 4
+; RV64-NEXT:    andi a2, a2, 2
+; RV64-NEXT:    or a2, a3, a2
 ; RV64-NEXT:    or a1, a1, a2
 ; RV64-NEXT:    srli a0, a0, 6
 ; RV64-NEXT:    or a0, a1, a0
@@ -139,33 +139,33 @@ define i24 @bitreverse_i24(i24 %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 16
 ; RV32-NEXT:    lui a2, 4096
+; RV32-NEXT:    lui a3, 1048335
 ; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    addi a3, a3, 240
 ; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    srli a0, a0, 16
 ; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    lui a1, 1048335
-; RV32-NEXT:    addi a1, a1, 240
-; RV32-NEXT:    and a3, a1, a2
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    srli a3, a3, 4
+; RV32-NEXT:    and a1, a3, a2
+; RV32-NEXT:    and a1, a0, a1
 ; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    or a0, a3, a0
-; RV32-NEXT:    lui a1, 1047757
-; RV32-NEXT:    addi a1, a1, -820
-; RV32-NEXT:    and a3, a1, a2
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    srli a3, a3, 2
+; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    lui a3, 1047757
+; RV32-NEXT:    addi a3, a3, -820
+; RV32-NEXT:    srli a1, a1, 4
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    and a1, a3, a2
+; RV32-NEXT:    and a1, a0, a1
 ; RV32-NEXT:    slli a0, a0, 2
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    or a0, a3, a0
-; RV32-NEXT:    lui a1, 1047211
-; RV32-NEXT:    addi a1, a1, -1366
-; RV32-NEXT:    and a2, a1, a2
+; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    lui a3, 1047211
+; RV32-NEXT:    addi a3, a3, -1366
+; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    srli a2, a2, 1
 ; RV32-NEXT:    slli a0, a0, 1
-; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    or a0, a2, a0
 ; RV32-NEXT:    ret
 ;
@@ -173,33 +173,33 @@ define i24 @bitreverse_i24(i24 %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 16
 ; RV64-NEXT:    lui a2, 4096
+; RV64-NEXT:    lui a3, 1048335
 ; RV64-NEXT:    addiw a2, a2, -1
+; RV64-NEXT:    addiw a3, a3, 240
 ; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    srli a0, a0, 16
 ; RV64-NEXT:    or a0, a0, a1
-; RV64-NEXT:    lui a1, 1048335
-; RV64-NEXT:    addiw a1, a1, 240
-; RV64-NEXT:    and a3, a1, a2
-; RV64-NEXT:    and a3, a0, a3
-; RV64-NEXT:    srli a3, a3, 4
+; RV64-NEXT:    and a1, a3, a2
+; RV64-NEXT:    and a1, a0, a1
 ; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    and a0, a0, a1
-; RV64-NEXT:    or a0, a3, a0
-; RV64-NEXT:    lui a1, 1047757
-; RV64-NEXT:    addiw a1, a1, -820
-; RV64-NEXT:    and a3, a1, a2
-; RV64-NEXT:    and a3, a0, a3
-; RV64-NEXT:    srli a3, a3, 2
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    lui a3, 1047757
+; RV64-NEXT:    addiw a3, a3, -820
+; RV64-NEXT:    srli a1, a1, 4
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    and a1, a3, a2
+; RV64-NEXT:    and a1, a0, a1
 ; RV64-NEXT:    slli a0, a0, 2
-; RV64-NEXT:    and a0, a0, a1
-; RV64-NEXT:    or a0, a3, a0
-; RV64-NEXT:    lui a1, 1047211
-; RV64-NEXT:    addiw a1, a1, -1366
-; RV64-NEXT:    and a2, a1, a2
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    lui a3, 1047211
+; RV64-NEXT:    addiw a3, a3, -1366
+; RV64-NEXT:    and a2, a3, a2
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    and a2, a0, a2
-; RV64-NEXT:    srli a2, a2, 1
 ; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    srli a2, a2, 1
+; RV64-NEXT:    and a0, a0, a3
 ; RV64-NEXT:    or a0, a2, a0
 ; RV64-NEXT:    ret
   %rev = call i24 @llvm.bitreverse.i24(i24 %x)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
index 70d1b25309c844..cf7cef83bcc135 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
@@ -21,34 +21,34 @@ define void @constant_fold_barrier_i128(ptr %p) {
 ; RV32-LABEL: constant_fold_barrier_i128:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    slli a1, a1, 11
 ; RV32-NEXT:    lw a2, 0(a0)
 ; RV32-NEXT:    lw a3, 4(a0)
 ; RV32-NEXT:    lw a4, 8(a0)
 ; RV32-NEXT:    lw a5, 12(a0)
+; RV32-NEXT:    slli a1, a1, 11
 ; RV32-NEXT:    and a2, a2, a1
 ; RV32-NEXT:    and a3, a3, zero
 ; RV32-NEXT:    and a4, a4, zero
 ; RV32-NEXT:    and a5, a5, zero
 ; RV32-NEXT:    add a2, a2, a1
-; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    add a6, a3, zero
+; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    sltu a3, a6, a3
 ; RV32-NEXT:    add a6, a6, a1
 ; RV32-NEXT:    seqz a7, a6
 ; RV32-NEXT:    and a1, a7, a1
+; RV32-NEXT:    add a7, a4, zero
+; RV32-NEXT:    add a5, a5, zero
+; RV32-NEXT:    sltu a4, a7, a4
 ; RV32-NEXT:    or a1, a3, a1
-; RV32-NEXT:    add a3, a4, zero
-; RV32-NEXT:    sltu a4, a3, a4
-; RV32-NEXT:    add a3, a3, a1
-; RV32-NEXT:    seqz a7, a3
-; RV32-NEXT:    and a1, a7, a1
+; RV32-NEXT:    add a7, a7, a1
+; RV32-NEXT:    seqz a3, a7
+; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    or a1, a4, a1
-; RV32-NEXT:    add a5, a5, zero
 ; RV32-NEXT:    add a1, a5, a1
 ; RV32-NEXT:    sw a2, 0(a0)
 ; RV32-NEXT:    sw a6, 4(a0)
-; RV32-NEXT:    sw a3, 8(a0)
+; RV32-NEXT:    sw a7, 8(a0)
 ; RV32-NEXT:    sw a1, 12(a0)
 ; RV32-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
index 51e8b6da39d099..2c3e3faddc3916 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
@@ -21,9 +21,9 @@ define i128 @constant_fold_barrier_i128(i128 %x) {
 ; RV64-LABEL: constant_fold_barrier_i128:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    and a1, a1, zero
 ; RV64-NEXT:    slli a2, a2, 11
 ; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    and a1, a1, zero
 ; RV64-NEXT:    add a0, a0, a2
 ; RV64-NEXT:    sltu a2, a0, a2
 ; RV64-NEXT:    add a1, a1, zero
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
index 05989c310541b8..1156edffe91943 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
@@ -117,8 +117,8 @@ define i64 @abs64(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    add a0, a0, a2
-; RV32I-NEXT:    sltu a3, a0, a2
 ; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    sltu a3, a0, a2
 ; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    xor a1, a1, a2
@@ -128,8 +128,8 @@ define i64 @abs64(i64 %x) {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    add a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a0, a2
 ; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a0, a2
 ; RV32ZBB-NEXT:    add a1, a1, a3
 ; RV32ZBB-NEXT:    xor a0, a0, a2
 ; RV32ZBB-NEXT:    xor a1, a1, a2
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
index c558639fda424e..68bf9240ccd1df 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
@@ -302,8 +302,8 @@ define i64 @rori_i64(i64 %a) nounwind {
 ; CHECK-NEXT:    slli a2, a0, 31
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    slli a3, a1, 31
-; CHECK-NEXT:    or a0, a0, a3
 ; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    or a0, a0, a3
 ; CHECK-NEXT:    or a1, a2, a1
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
index 1184905c17edea..7f22127ad3536c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
@@ -12,31 +12,31 @@ define i32 @ctlz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    beqz a0, .LBB0_2
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    lui a2, 209715
 ; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 16
@@ -63,11 +63,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: ctlz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a6, 61681
 ; RV32I-NEXT:    addi a5, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a4, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a3, a2, -241
+; RV32I-NEXT:    addi a4, a3, 819
+; RV32I-NEXT:    addi a3, a6, -241
 ; RV32I-NEXT:    li a2, 32
 ; RV32I-NEXT:    beqz a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
@@ -155,22 +155,22 @@ define i32 @cttz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    not a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a1, a0
-; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    lui a2, 209715
 ; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 61681
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -196,11 +196,11 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: cttz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    beqz a0, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    not a1, a0
@@ -271,17 +271,17 @@ define i32 @ctpop_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    lui a2, 209715
 ; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 61681
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -305,39 +305,39 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    li a1, 0
@@ -364,39 +364,39 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
 ; RV32I-NEXT:  .LBB6_2:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    sltiu a0, a0, 2
@@ -429,39 +429,39 @@ define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
 ; RV32I-NEXT:  .LBB7_2:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    sltiu a0, a0, 2
@@ -491,39 +491,39 @@ define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    xori a0, a0, 1
@@ -548,39 +548,39 @@ define i1 @ctpop_i64_ne_one(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    xori a0, a0, 1
@@ -872,8 +872,8 @@ define i64 @abs_i64(i64 %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    srai a2, a1, 31
 ; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    xor a0, a0, a2
 ; CHECK-NEXT:    xor a1, a1, a2
@@ -923,15 +923,15 @@ define i32 @bswap_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 24
 ; RV32I-NEXT:    srli a2, a0, 24
+; RV32I-NEXT:    lui a3, 16
 ; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    lui a2, 16
-; RV32I-NEXT:    addi a2, a2, -256
-; RV32I-NEXT:    and a3, a0, a2
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    addi a3, a3, -256
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: bswap_i32:
@@ -949,25 +949,24 @@ define i64 @bswap_i64(i64 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a1, 24
 ; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    lui a4, 16
+; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    slli a6, a0, 24
 ; RV32I-NEXT:    or a2, a3, a2
-; RV32I-NEXT:    lui a3, 16
-; RV32I-NEXT:    addi a3, a3, -256
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    or a2, a1, a4
-; RV32I-NEXT:    slli a1, a0, 24
-; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    and a4, a0, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    and a0, a0, a3
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    or a1, a0, a4
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    srli a6, a0, 8
+; RV32I-NEXT:    addi a4, a4, -256
+; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a5, a5, a4
+; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    and a4, a6, a4
+; RV32I-NEXT:    or a2, a2, a5
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    slli a5, a0, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    or a0, a2, a1
+; RV32I-NEXT:    or a1, a3, a5
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: bswap_i64:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll
index 80e43c94aab0e6..a647eae82dddfc 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll
@@ -111,8 +111,8 @@ define i32 @packh_i32(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: packh_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a2, 16
-; CHECK-NEXT:    addi a2, a2, -256
 ; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    addi a2, a2, -256
 ; CHECK-NEXT:    slli a1, a1, 8
 ; CHECK-NEXT:    and a1, a1, a2
 ; CHECK-NEXT:    or a0, a1, a0
@@ -149,8 +149,8 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: packh_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a1, 16
-; CHECK-NEXT:    addi a1, a1, -256
 ; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    addi a1, a1, -256
 ; CHECK-NEXT:    slli a2, a2, 8
 ; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll
index 2bd0c78659b004..9584270d8e66f5 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll
@@ -1025,8 +1025,8 @@ define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: pack_i64_2:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a1, a0
@@ -1337,8 +1337,8 @@ define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
 ; RV64I-LABEL: array_index_lshr_sh3_sh3:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a1, 58
-; RV64I-NEXT:    slli a1, a1, 6
 ; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ld a0, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 30b7d4b89f49f2..961811d3b623c7 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -17,32 +17,32 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -75,32 +75,32 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a1, .LBB1_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -142,32 +142,32 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a2, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -202,35 +202,35 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    li s0, -1
 ; RV64I-NEXT:    slli a1, a0, 32
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    addiw a2, a3, 1365
+; RV64I-NEXT:    srliw a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    lui a2, 4112
 ; RV64I-NEXT:    srli s1, a1, 32
-; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    addiw a1, a3, -241
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    addiw a1, a2, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    beqz s1, .LBB3_2
 ; RV64I-NEXT:  # %bb.1:
@@ -276,32 +276,32 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    .cfi_offset ra, -8
 ; RV64I-NEXT:    srliw a0, a0, 2
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srli a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -334,29 +334,27 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB5_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 21845
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addi a1, a2, 1365
+; RV64I-NEXT:    srli a2, a0, 2
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 8
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srli a2, a0, 32
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 13107
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
@@ -365,20 +363,22 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 3855
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, -241
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 3855
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -414,24 +414,24 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    not a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -457,24 +457,24 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    not a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -501,24 +501,24 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    li s0, -1
 ; RV64I-NEXT:    not a0, a0
 ; RV64I-NEXT:    addi a1, s1, -1
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    slli s1, s1, 32
@@ -560,24 +560,24 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    not a0, a0
 ; RV64I-NEXT:    addi a1, s0, -1
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    slli s0, s0, 32
@@ -622,19 +622,17 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    not a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 21845
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    addi a1, a2, 1365
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 13107
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
@@ -643,19 +641,21 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 3855
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 3855
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -688,19 +688,19 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -725,19 +725,19 @@ define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -762,23 +762,23 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lwu a0, 0(a0)
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -811,8 +811,6 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    lui a2, 13107
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
@@ -821,19 +819,21 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 3855
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 3855
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -998,8 +998,8 @@ define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: minu_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 32
-; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    srli a3, a3, 32
 ; RV64I-NEXT:    bltu a2, a3, .LBB23_2
 ; RV64I-NEXT:  # %bb.1:
@@ -1011,8 +1011,8 @@ define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64ZBB-LABEL: minu_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    slli a1, a1, 32
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    minu a0, a0, a1
 ; RV64ZBB-NEXT:    sext.w a0, a0
@@ -1046,8 +1046,8 @@ define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: maxu_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 32
-; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    srli a3, a3, 32
 ; RV64I-NEXT:    bltu a3, a2, .LBB25_2
 ; RV64I-NEXT:  # %bb.1:
@@ -1059,8 +1059,8 @@ define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64ZBB-LABEL: maxu_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    slli a1, a1, 32
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
 ; RV64ZBB-NEXT:    sext.w a0, a0
@@ -1186,15 +1186,15 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slliw a1, a0, 24
 ; RV64I-NEXT:    srliw a2, a0, 24
+; RV64I-NEXT:    lui a3, 16
 ; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    lui a2, 16
-; RV64I-NEXT:    addiw a2, a2, -256
-; RV64I-NEXT:    and a3, a0, a2
-; RV64I-NEXT:    slliw a3, a3, 8
-; RV64I-NEXT:    srliw a0, a0, 8
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    addiw a3, a3, -256
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    slliw a0, a0, 8
+; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: bswap_i32:
@@ -1213,15 +1213,15 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 24
 ; RV64I-NEXT:    srliw a3, a0, 24
+; RV64I-NEXT:    lui a4, 16
 ; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    lui a3, 16
-; RV64I-NEXT:    addi a3, a3, -256
-; RV64I-NEXT:    and a4, a0, a3
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    srliw a0, a0, 8
-; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 8
+; RV64I-NEXT:    addi a4, a4, -256
+; RV64I-NEXT:    and a0, a0, a4
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a2, a2, a3
 ; RV64I-NEXT:    or a0, a2, a0
-; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    sw a0, 0(a1)
 ; RV64I-NEXT:    ret
 ;
@@ -1243,29 +1243,29 @@ define i64 @bswap_i64(i64 %a) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a0, 56
 ; RV64I-NEXT:    srli a2, a0, 56
-; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    lui a2, 16
-; RV64I-NEXT:    addiw a2, a2, -256
-; RV64I-NEXT:    and a3, a0, a2
-; RV64I-NEXT:    slli a3, a3, 40
+; RV64I-NEXT:    lui a3, 16
 ; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    and a2, a4, a2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    lui a2, 4080
-; RV64I-NEXT:    and a3, a0, a2
-; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    addiw a3, a3, -256
+; RV64I-NEXT:    and a4, a4, a3
+; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    srli a4, a0, 24
-; RV64I-NEXT:    and a2, a4, a2
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    lui a3, 1044480
-; RV64I-NEXT:    and a4, a0, a3
-; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    and a4, a4, a2
+; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    slli a2, a2, 24
 ; RV64I-NEXT:    or a2, a2, a4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    lui a4, 1044480
+; RV64I-NEXT:    and a3, a0, a3
+; RV64I-NEXT:    slli a3, a3, 40
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    and a0, a0, a4
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: bswap_i64:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
index 5cf2619a476bc0..338476a1bec83f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
@@ -98,8 +98,8 @@ define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: pack_i64_2:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a1, a0
@@ -147,8 +147,8 @@ define signext i32 @packh_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: packh_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 16
-; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    or a0, a1, a0
@@ -157,8 +157,8 @@ define signext i32 @packh_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64ZBKB-LABEL: packh_i32:
 ; RV64ZBKB:       # %bb.0:
 ; RV64ZBKB-NEXT:    lui a2, 16
-; RV64ZBKB-NEXT:    addiw a2, a2, -256
 ; RV64ZBKB-NEXT:    andi a0, a0, 255
+; RV64ZBKB-NEXT:    addiw a2, a2, -256
 ; RV64ZBKB-NEXT:    slli a1, a1, 8
 ; RV64ZBKB-NEXT:    and a1, a1, a2
 ; RV64ZBKB-NEXT:    or a0, a1, a0
@@ -195,8 +195,8 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind {
 ; RV64I-LABEL: packh_i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 16
-; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    or a0, a1, a0
@@ -205,8 +205,8 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind {
 ; RV64ZBKB-LABEL: packh_i64:
 ; RV64ZBKB:       # %bb.0:
 ; RV64ZBKB-NEXT:    lui a2, 16
-; RV64ZBKB-NEXT:    addiw a2, a2, -256
 ; RV64ZBKB-NEXT:    andi a0, a0, 255
+; RV64ZBKB-NEXT:    addiw a2, a2, -256
 ; RV64ZBKB-NEXT:    slli a1, a1, 8
 ; RV64ZBKB-NEXT:    and a1, a1, a2
 ; RV64ZBKB-NEXT:    or a0, a1, a0
@@ -307,10 +307,10 @@ define i64 @pack_i64_allWUsers(i32 signext %0, i32 signext %1, i32 signext %2) {
 ; RV64I-LABEL: pack_i64_allWUsers:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    slli a2, a2, 32
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    slli a2, a2, 32
 ; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
index 972cc9d66bfb71..fc9be949884511 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
@@ -524,8 +524,8 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32-NEXT:    sw a6, 40(sp)
 ; ILP32-NEXT:    sw a7, 44(sp)
 ; ILP32-NEXT:    addi a1, a0, 7
-; ILP32-NEXT:    andi a1, a1, -8
 ; ILP32-NEXT:    addi a0, a0, 15
+; ILP32-NEXT:    andi a1, a1, -8
 ; ILP32-NEXT:    sw a0, 12(sp)
 ; ILP32-NEXT:    lw a0, 0(a1)
 ; ILP32-NEXT:    lw a1, 4(a1)
@@ -618,8 +618,8 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV64-NEXT:    sd a6, 64(sp)
 ; RV64-NEXT:    sd a7, 72(sp)
 ; RV64-NEXT:    addi a1, a0, 7
-; RV64-NEXT:    andi a1, a1, -8
 ; RV64-NEXT:    addi a0, a0, 15
+; RV64-NEXT:    andi a1, a1, -8
 ; RV64-NEXT:    sd a0, 8(sp)
 ; RV64-NEXT:    ld a0, 0(a1)
 ; RV64-NEXT:    addi sp, sp, 80
@@ -642,8 +642,8 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV32-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32-WITHFP-NEXT:    addi a1, a0, 7
-; RV32-WITHFP-NEXT:    andi a1, a1, -8
 ; RV32-WITHFP-NEXT:    addi a0, a0, 15
+; RV32-WITHFP-NEXT:    andi a1, a1, -8
 ; RV32-WITHFP-NEXT:    sw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    lw a0, 0(a1)
 ; RV32-WITHFP-NEXT:    lw a1, 4(a1)
@@ -669,8 +669,8 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV64-WITHFP-NEXT:    sd a6, 48(s0)
 ; RV64-WITHFP-NEXT:    sd a7, 56(s0)
 ; RV64-WITHFP-NEXT:    addi a1, a0, 7
-; RV64-WITHFP-NEXT:    andi a1, a1, -8
 ; RV64-WITHFP-NEXT:    addi a0, a0, 15
+; RV64-WITHFP-NEXT:    andi a1, a1, -8
 ; RV64-WITHFP-NEXT:    sd a0, -24(s0)
 ; RV64-WITHFP-NEXT:    ld a0, 0(a1)
 ; RV64-WITHFP-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -863,8 +863,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-NEXT:    sw a6, 24(sp)
 ; ILP32-NEXT:    sw a7, 28(sp)
 ; ILP32-NEXT:    addi a3, a0, 7
-; ILP32-NEXT:    andi a3, a3, -8
 ; ILP32-NEXT:    addi a0, a0, 15
+; ILP32-NEXT:    andi a3, a3, -8
 ; ILP32-NEXT:    sw a0, 4(sp)
 ; ILP32-NEXT:    lw a4, 0(a3)
 ; ILP32-NEXT:    lw a3, 4(a3)
@@ -966,8 +966,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV64-NEXT:    sd a6, 48(sp)
 ; RV64-NEXT:    sd a7, 56(sp)
 ; RV64-NEXT:    addi a2, a0, 7
-; RV64-NEXT:    andi a2, a2, -8
 ; RV64-NEXT:    addi a0, a0, 15
+; RV64-NEXT:    andi a2, a2, -8
 ; RV64-NEXT:    sd a0, 8(sp)
 ; RV64-NEXT:    ld a0, 0(a2)
 ; RV64-NEXT:    add a0, a1, a0
@@ -989,8 +989,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV32-WITHFP-NEXT:    sw a6, 16(s0)
 ; RV32-WITHFP-NEXT:    sw a7, 20(s0)
 ; RV32-WITHFP-NEXT:    addi a3, a0, 7
-; RV32-WITHFP-NEXT:    andi a3, a3, -8
 ; RV32-WITHFP-NEXT:    addi a0, a0, 15
+; RV32-WITHFP-NEXT:    andi a3, a3, -8
 ; RV32-WITHFP-NEXT:    sw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    lw a4, 0(a3)
 ; RV32-WITHFP-NEXT:    lw a3, 4(a3)
@@ -1019,8 +1019,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV64-WITHFP-NEXT:    sd a6, 32(s0)
 ; RV64-WITHFP-NEXT:    sd a7, 40(s0)
 ; RV64-WITHFP-NEXT:    addi a2, a0, 7
-; RV64-WITHFP-NEXT:    andi a2, a2, -8
 ; RV64-WITHFP-NEXT:    addi a0, a0, 15
+; RV64-WITHFP-NEXT:    andi a2, a2, -8
 ; RV64-WITHFP-NEXT:    sd a0, -24(s0)
 ; RV64-WITHFP-NEXT:    ld a0, 0(a2)
 ; RV64-WITHFP-NEXT:    add a0, a1, a0
@@ -1169,9 +1169,9 @@ define void @va3_caller() nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    lui a0, 5
-; RV64-NEXT:    addiw a2, a0, -480
+; RV64-NEXT:    lui a1, 5
 ; RV64-NEXT:    li a0, 2
+; RV64-NEXT:    addiw a2, a1, -480
 ; RV64-NEXT:    li a1, 1111
 ; RV64-NEXT:    call va3
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1201,9 +1201,9 @@ define void @va3_caller() nounwind {
 ; RV64-WITHFP-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-WITHFP-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; RV64-WITHFP-NEXT:    addi s0, sp, 16
-; RV64-WITHFP-NEXT:    lui a0, 5
-; RV64-WITHFP-NEXT:    addiw a2, a0, -480
+; RV64-WITHFP-NEXT:    lui a1, 5
 ; RV64-WITHFP-NEXT:    li a0, 2
+; RV64-WITHFP-NEXT:    addiw a2, a1, -480
 ; RV64-WITHFP-NEXT:    li a1, 1111
 ; RV64-WITHFP-NEXT:    call va3
 ; RV64-WITHFP-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll
index fed4a242a695b4..c9a48acb8d14a9 100644
--- a/llvm/test/CodeGen/RISCV/abds-neg.ll
+++ b/llvm/test/CodeGen/RISCV/abds-neg.ll
@@ -11,8 +11,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -23,8 +23,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -62,8 +62,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -74,8 +74,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -113,8 +113,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -125,8 +125,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -164,8 +164,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -176,8 +176,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -266,8 +266,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i16_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -278,8 +278,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -674,10 +674,10 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB11_12:
 ; RV32I-NEXT:    sltu t3, a6, t0
 ; RV32I-NEXT:    sub t1, t1, t2
-; RV32I-NEXT:    sub t1, t1, t3
 ; RV32I-NEXT:    sub a6, a6, t0
-; RV32I-NEXT:    sltu t0, a6, t6
-; RV32I-NEXT:    sub t0, t1, t0
+; RV32I-NEXT:    sub t0, t1, t3
+; RV32I-NEXT:    sltu t1, a6, t6
+; RV32I-NEXT:    sub t0, t0, t1
 ; RV32I-NEXT:    sub a6, a6, t6
 ; RV32I-NEXT:  .LBB11_13:
 ; RV32I-NEXT:    snez t1, a6
@@ -694,20 +694,20 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:  .LBB11_16:
 ; RV32I-NEXT:    or a3, a1, a2
-; RV32I-NEXT:    snez a3, a3
 ; RV32I-NEXT:    neg a4, a6
-; RV32I-NEXT:    sltu a5, a4, a3
-; RV32I-NEXT:    neg a6, t0
-; RV32I-NEXT:    sub a5, a6, a5
+; RV32I-NEXT:    neg a5, t0
 ; RV32I-NEXT:    snez a6, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    snez a3, a3
 ; RV32I-NEXT:    add a2, a2, a6
+; RV32I-NEXT:    sltu a6, a4, a3
 ; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a3, a5, a6
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -793,10 +793,10 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB11_12:
 ; RV32ZBB-NEXT:    sltu t3, a6, t0
 ; RV32ZBB-NEXT:    sub t1, t1, t2
-; RV32ZBB-NEXT:    sub t1, t1, t3
 ; RV32ZBB-NEXT:    sub a6, a6, t0
-; RV32ZBB-NEXT:    sltu t0, a6, t6
-; RV32ZBB-NEXT:    sub t0, t1, t0
+; RV32ZBB-NEXT:    sub t0, t1, t3
+; RV32ZBB-NEXT:    sltu t1, a6, t6
+; RV32ZBB-NEXT:    sub t0, t0, t1
 ; RV32ZBB-NEXT:    sub a6, a6, t6
 ; RV32ZBB-NEXT:  .LBB11_13:
 ; RV32ZBB-NEXT:    snez t1, a6
@@ -813,20 +813,20 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:  .LBB11_16:
 ; RV32ZBB-NEXT:    or a3, a1, a2
-; RV32ZBB-NEXT:    snez a3, a3
 ; RV32ZBB-NEXT:    neg a4, a6
-; RV32ZBB-NEXT:    sltu a5, a4, a3
-; RV32ZBB-NEXT:    neg a6, t0
-; RV32ZBB-NEXT:    sub a5, a6, a5
+; RV32ZBB-NEXT:    neg a5, t0
 ; RV32ZBB-NEXT:    snez a6, a1
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    snez a3, a3
 ; RV32ZBB-NEXT:    add a2, a2, a6
+; RV32ZBB-NEXT:    sltu a6, a4, a3
 ; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:    sub a4, a4, a3
-; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a3, a5, a6
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a5, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -921,10 +921,10 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB12_12:
 ; RV32I-NEXT:    sltu t3, a6, t0
 ; RV32I-NEXT:    sub t1, t1, t2
-; RV32I-NEXT:    sub t1, t1, t3
 ; RV32I-NEXT:    sub a6, a6, t0
-; RV32I-NEXT:    sltu t0, a6, t6
-; RV32I-NEXT:    sub t0, t1, t0
+; RV32I-NEXT:    sub t0, t1, t3
+; RV32I-NEXT:    sltu t1, a6, t6
+; RV32I-NEXT:    sub t0, t0, t1
 ; RV32I-NEXT:    sub a6, a6, t6
 ; RV32I-NEXT:  .LBB12_13:
 ; RV32I-NEXT:    snez t1, a6
@@ -941,20 +941,20 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:  .LBB12_16:
 ; RV32I-NEXT:    or a3, a1, a2
-; RV32I-NEXT:    snez a3, a3
 ; RV32I-NEXT:    neg a4, a6
-; RV32I-NEXT:    sltu a5, a4, a3
-; RV32I-NEXT:    neg a6, t0
-; RV32I-NEXT:    sub a5, a6, a5
+; RV32I-NEXT:    neg a5, t0
 ; RV32I-NEXT:    snez a6, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    snez a3, a3
 ; RV32I-NEXT:    add a2, a2, a6
+; RV32I-NEXT:    sltu a6, a4, a3
 ; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a3, a5, a6
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1040,10 +1040,10 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB12_12:
 ; RV32ZBB-NEXT:    sltu t3, a6, t0
 ; RV32ZBB-NEXT:    sub t1, t1, t2
-; RV32ZBB-NEXT:    sub t1, t1, t3
 ; RV32ZBB-NEXT:    sub a6, a6, t0
-; RV32ZBB-NEXT:    sltu t0, a6, t6
-; RV32ZBB-NEXT:    sub t0, t1, t0
+; RV32ZBB-NEXT:    sub t0, t1, t3
+; RV32ZBB-NEXT:    sltu t1, a6, t6
+; RV32ZBB-NEXT:    sub t0, t0, t1
 ; RV32ZBB-NEXT:    sub a6, a6, t6
 ; RV32ZBB-NEXT:  .LBB12_13:
 ; RV32ZBB-NEXT:    snez t1, a6
@@ -1060,20 +1060,20 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:  .LBB12_16:
 ; RV32ZBB-NEXT:    or a3, a1, a2
-; RV32ZBB-NEXT:    snez a3, a3
 ; RV32ZBB-NEXT:    neg a4, a6
-; RV32ZBB-NEXT:    sltu a5, a4, a3
-; RV32ZBB-NEXT:    neg a6, t0
-; RV32ZBB-NEXT:    sub a5, a6, a5
+; RV32ZBB-NEXT:    neg a5, t0
 ; RV32ZBB-NEXT:    snez a6, a1
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    snez a3, a3
 ; RV32ZBB-NEXT:    add a2, a2, a6
+; RV32ZBB-NEXT:    sltu a6, a4, a3
 ; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:    sub a4, a4, a3
-; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a3, a5, a6
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a5, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -1120,8 +1120,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    bge a0, a1, .LBB13_3
@@ -1140,8 +1140,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    mv a2, a0
 ; RV64I-NEXT:    bge a0, a1, .LBB13_3
@@ -1175,8 +1175,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    bge a0, a1, .LBB14_3
@@ -1195,8 +1195,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    mv a2, a0
 ; RV64I-NEXT:    bge a0, a1, .LBB14_3
@@ -1449,26 +1449,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    mv a5, t0
 ; RV32I-NEXT:    mv a4, a7
 ; RV32I-NEXT:  .LBB17_19:
-; RV32I-NEXT:    sltu a6, t3, a4
-; RV32I-NEXT:    sub a7, t4, a5
-; RV32I-NEXT:    sltu a5, a2, a1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    mv a7, a5
+; RV32I-NEXT:    sltu a7, t3, a4
+; RV32I-NEXT:    sub a5, t4, a5
+; RV32I-NEXT:    sltu a6, a2, a1
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beq t1, a3, .LBB17_21
 ; RV32I-NEXT:  # %bb.20:
 ; RV32I-NEXT:    sltu a7, t1, a3
 ; RV32I-NEXT:  .LBB17_21:
 ; RV32I-NEXT:    sub a4, t3, a4
-; RV32I-NEXT:    sltu t0, a4, a7
-; RV32I-NEXT:    sub a6, a6, t0
-; RV32I-NEXT:    sub a4, a4, a7
 ; RV32I-NEXT:    sub a3, t1, a3
-; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a2, a2, a1
+; RV32I-NEXT:    sltu a1, a4, a7
+; RV32I-NEXT:    sub a4, a4, a7
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a5, a5, a1
 ; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a6, 12(a0)
+; RV32I-NEXT:    sw a5, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1576,26 +1576,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    mv a5, t0
 ; RV32ZBB-NEXT:    mv a4, a7
 ; RV32ZBB-NEXT:  .LBB17_19:
-; RV32ZBB-NEXT:    sltu a6, t3, a4
-; RV32ZBB-NEXT:    sub a7, t4, a5
-; RV32ZBB-NEXT:    sltu a5, a2, a1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    mv a7, a5
+; RV32ZBB-NEXT:    sltu a7, t3, a4
+; RV32ZBB-NEXT:    sub a5, t4, a5
+; RV32ZBB-NEXT:    sltu a6, a2, a1
+; RV32ZBB-NEXT:    sub a5, a5, a7
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beq t1, a3, .LBB17_21
 ; RV32ZBB-NEXT:  # %bb.20:
 ; RV32ZBB-NEXT:    sltu a7, t1, a3
 ; RV32ZBB-NEXT:  .LBB17_21:
 ; RV32ZBB-NEXT:    sub a4, t3, a4
-; RV32ZBB-NEXT:    sltu t0, a4, a7
-; RV32ZBB-NEXT:    sub a6, a6, t0
-; RV32ZBB-NEXT:    sub a4, a4, a7
 ; RV32ZBB-NEXT:    sub a3, t1, a3
-; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a2, a2, a1
+; RV32ZBB-NEXT:    sltu a1, a4, a7
+; RV32ZBB-NEXT:    sub a4, a4, a7
+; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a5, a5, a1
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a6, 12(a0)
+; RV32ZBB-NEXT:    sw a5, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -1647,8 +1647,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_cmp_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a0, 24
-; RV32I-NEXT:    srai a2, a2, 24
 ; RV32I-NEXT:    slli a3, a1, 24
+; RV32I-NEXT:    srai a2, a2, 24
 ; RV32I-NEXT:    srai a3, a3, 24
 ; RV32I-NEXT:    bge a3, a2, .LBB18_2
 ; RV32I-NEXT:  # %bb.1:
@@ -1661,8 +1661,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 56
-; RV64I-NEXT:    srai a2, a2, 56
 ; RV64I-NEXT:    slli a3, a1, 56
+; RV64I-NEXT:    srai a2, a2, 56
 ; RV64I-NEXT:    srai a3, a3, 56
 ; RV64I-NEXT:    bge a3, a2, .LBB18_2
 ; RV64I-NEXT:  # %bb.1:
@@ -1694,8 +1694,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_cmp_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    srai a2, a2, 16
 ; RV32I-NEXT:    slli a3, a0, 16
+; RV32I-NEXT:    srai a2, a2, 16
 ; RV32I-NEXT:    srai a3, a3, 16
 ; RV32I-NEXT:    blt a3, a2, .LBB19_2
 ; RV32I-NEXT:  # %bb.1:
@@ -1708,8 +1708,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a1, 48
-; RV64I-NEXT:    srai a2, a2, 48
 ; RV64I-NEXT:    slli a3, a0, 48
+; RV64I-NEXT:    srai a2, a2, 48
 ; RV64I-NEXT:    srai a3, a3, 48
 ; RV64I-NEXT:    blt a3, a2, .LBB19_2
 ; RV64I-NEXT:  # %bb.1:
@@ -1898,30 +1898,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t1, a5, a6
 ; RV32I-NEXT:    sub a7, a7, t0
-; RV32I-NEXT:    sub a7, a7, t1
-; RV32I-NEXT:    sub a6, a5, a6
-; RV32I-NEXT:    sltu a5, a6, t5
-; RV32I-NEXT:    sub a5, a7, a5
-; RV32I-NEXT:    sub a6, a6, t5
+; RV32I-NEXT:    sub a5, a5, a6
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t4
+; RV32I-NEXT:    sub a6, a7, t1
+; RV32I-NEXT:    sltu a7, a5, t5
+; RV32I-NEXT:    sub a1, a5, t5
+; RV32I-NEXT:    sub a5, a4, t4
+; RV32I-NEXT:    sub a4, a6, a7
 ; RV32I-NEXT:    sub a2, a3, a2
 ; RV32I-NEXT:    j .LBB22_11
 ; RV32I-NEXT:  .LBB22_10:
 ; RV32I-NEXT:    sub a7, t0, a7
-; RV32I-NEXT:    sub a6, a6, a5
-; RV32I-NEXT:    sub a5, a7, t1
-; RV32I-NEXT:    sltu a7, a6, t3
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a5, a5, a7
-; RV32I-NEXT:    sub a6, a6, t3
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a5, a6, a5
+; RV32I-NEXT:    sub a4, a1, a4
+; RV32I-NEXT:    sub a6, a7, t1
+; RV32I-NEXT:    sltu a7, a5, t3
+; RV32I-NEXT:    sub a1, a5, t3
+; RV32I-NEXT:    sub a5, a4, t2
+; RV32I-NEXT:    sub a4, a6, a7
 ; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:  .LBB22_11:
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
-; RV32I-NEXT:    sw a6, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a5, 4(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_cmp_i128:
@@ -1985,30 +1985,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t1, a5, a6
 ; RV32ZBB-NEXT:    sub a7, a7, t0
-; RV32ZBB-NEXT:    sub a7, a7, t1
-; RV32ZBB-NEXT:    sub a6, a5, a6
-; RV32ZBB-NEXT:    sltu a5, a6, t5
-; RV32ZBB-NEXT:    sub a5, a7, a5
-; RV32ZBB-NEXT:    sub a6, a6, t5
+; RV32ZBB-NEXT:    sub a5, a5, a6
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t4
+; RV32ZBB-NEXT:    sub a6, a7, t1
+; RV32ZBB-NEXT:    sltu a7, a5, t5
+; RV32ZBB-NEXT:    sub a1, a5, t5
+; RV32ZBB-NEXT:    sub a5, a4, t4
+; RV32ZBB-NEXT:    sub a4, a6, a7
 ; RV32ZBB-NEXT:    sub a2, a3, a2
 ; RV32ZBB-NEXT:    j .LBB22_11
 ; RV32ZBB-NEXT:  .LBB22_10:
 ; RV32ZBB-NEXT:    sub a7, t0, a7
-; RV32ZBB-NEXT:    sub a6, a6, a5
-; RV32ZBB-NEXT:    sub a5, a7, t1
-; RV32ZBB-NEXT:    sltu a7, a6, t3
-; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a5, a5, a7
-; RV32ZBB-NEXT:    sub a6, a6, t3
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a5, a6, a5
+; RV32ZBB-NEXT:    sub a4, a1, a4
+; RV32ZBB-NEXT:    sub a6, a7, t1
+; RV32ZBB-NEXT:    sltu a7, a5, t3
+; RV32ZBB-NEXT:    sub a1, a5, t3
+; RV32ZBB-NEXT:    sub a5, a4, t2
+; RV32ZBB-NEXT:    sub a4, a6, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:  .LBB22_11:
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
-; RV32ZBB-NEXT:    sw a6, 8(a0)
-; RV32ZBB-NEXT:    sw a5, 12(a0)
+; RV32ZBB-NEXT:    sw a5, 4(a0)
+; RV32ZBB-NEXT:    sw a1, 8(a0)
+; RV32ZBB-NEXT:    sw a4, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_cmp_i128:
@@ -2289,12 +2289,12 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
+; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
@@ -2312,12 +2312,12 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    sltu a4, a0, a2
 ; RV32ZBB-NEXT:    sub a1, a1, a3
-; RV32ZBB-NEXT:    sub a1, a1, a4
 ; RV32ZBB-NEXT:    sub a0, a0, a2
+; RV32ZBB-NEXT:    sub a1, a1, a4
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    xor a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0
@@ -2340,12 +2340,12 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
+; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
@@ -2363,12 +2363,12 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    sltu a4, a0, a2
 ; RV32ZBB-NEXT:    sub a1, a1, a3
-; RV32ZBB-NEXT:    sub a1, a1, a4
 ; RV32ZBB-NEXT:    sub a0, a0, a2
+; RV32ZBB-NEXT:    sub a1, a1, a4
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    xor a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0
@@ -2392,64 +2392,64 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a3, 0(a2)
 ; RV32I-NEXT:    lw a4, 4(a2)
 ; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a7, 12(a2)
-; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a6, 12(a2)
+; RV32I-NEXT:    lw t0, 8(a1)
+; RV32I-NEXT:    lw t1, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a6, a5
-; RV32I-NEXT:    sub t0, t0, a7
-; RV32I-NEXT:    sltu a7, a2, a3
-; RV32I-NEXT:    sub t1, t0, t1
-; RV32I-NEXT:    mv t0, a7
-; RV32I-NEXT:    beq a1, a4, .LBB31_2
+; RV32I-NEXT:    lw a7, 4(a1)
+; RV32I-NEXT:    sltu a1, t0, a5
+; RV32I-NEXT:    sub t1, t1, a6
+; RV32I-NEXT:    sltu a6, a2, a3
+; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    beq a7, a4, .LBB31_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t0, a1, a4
+; RV32I-NEXT:    sltu t1, a7, a4
 ; RV32I-NEXT:  .LBB31_2:
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sltu a6, a5, t0
-; RV32I-NEXT:    sub a6, t1, a6
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub t1, a1, a7
-; RV32I-NEXT:    sub a4, a5, t0
+; RV32I-NEXT:    sub a5, t0, a5
+; RV32I-NEXT:    sub a4, a7, a4
 ; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    srai a1, a6, 31
+; RV32I-NEXT:    sltu a2, a5, t1
+; RV32I-NEXT:    sub t0, a4, a6
+; RV32I-NEXT:    sub a4, a5, t1
+; RV32I-NEXT:    sub a5, a1, a2
+; RV32I-NEXT:    srai a1, a5, 31
 ; RV32I-NEXT:    xor a2, a4, a1
-; RV32I-NEXT:    sltu a4, a1, a2
-; RV32I-NEXT:    xor a5, a6, a1
-; RV32I-NEXT:    sub a5, a1, a5
-; RV32I-NEXT:    sub a4, a5, a4
-; RV32I-NEXT:    xor a3, a3, a1
-; RV32I-NEXT:    sltu a5, a1, a3
-; RV32I-NEXT:    xor a6, t1, a1
-; RV32I-NEXT:    mv a7, a5
-; RV32I-NEXT:    beqz t1, .LBB31_4
+; RV32I-NEXT:    xor a5, a5, a1
+; RV32I-NEXT:    xor a4, a3, a1
+; RV32I-NEXT:    sltu a3, a1, a2
+; RV32I-NEXT:    sub a6, a1, a5
+; RV32I-NEXT:    sltu a5, a1, a4
+; RV32I-NEXT:    sub a3, a6, a3
+; RV32I-NEXT:    xor a7, t0, a1
+; RV32I-NEXT:    mv a6, a5
+; RV32I-NEXT:    beqz t0, .LBB31_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu a7, a1, a6
+; RV32I-NEXT:    sltu a6, a1, a7
 ; RV32I-NEXT:  .LBB31_4:
 ; RV32I-NEXT:    sub a2, a1, a2
-; RV32I-NEXT:    sltu t0, a2, a7
-; RV32I-NEXT:    sub a4, a4, t0
-; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a6, a1, a6
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a7, a1, a7
+; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    sltu a4, a2, a6
+; RV32I-NEXT:    sub a2, a2, a6
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_subnsw_i128:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a4, a0, a2
 ; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    sub a1, a1, a4
 ; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    sub a1, a1, a4
 ; RV64I-NEXT:    srai a2, a1, 63
 ; RV64I-NEXT:    xor a0, a0, a2
-; RV64I-NEXT:    sltu a3, a2, a0
 ; RV64I-NEXT:    xor a1, a1, a2
+; RV64I-NEXT:    sltu a3, a2, a0
 ; RV64I-NEXT:    sub a1, a2, a1
 ; RV64I-NEXT:    sub a1, a1, a3
 ; RV64I-NEXT:    sub a0, a2, a0
@@ -2460,64 +2460,64 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
 ; RV32ZBB-NEXT:    lw a4, 4(a2)
 ; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a7, 12(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a1)
-; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a2)
+; RV32ZBB-NEXT:    lw t0, 8(a1)
+; RV32ZBB-NEXT:    lw t1, 12(a1)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a6, a5
-; RV32ZBB-NEXT:    sub t0, t0, a7
-; RV32ZBB-NEXT:    sltu a7, a2, a3
-; RV32ZBB-NEXT:    sub t1, t0, t1
-; RV32ZBB-NEXT:    mv t0, a7
-; RV32ZBB-NEXT:    beq a1, a4, .LBB31_2
+; RV32ZBB-NEXT:    lw a7, 4(a1)
+; RV32ZBB-NEXT:    sltu a1, t0, a5
+; RV32ZBB-NEXT:    sub t1, t1, a6
+; RV32ZBB-NEXT:    sltu a6, a2, a3
+; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    beq a7, a4, .LBB31_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t0, a1, a4
+; RV32ZBB-NEXT:    sltu t1, a7, a4
 ; RV32ZBB-NEXT:  .LBB31_2:
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sltu a6, a5, t0
-; RV32ZBB-NEXT:    sub a6, t1, a6
-; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub t1, a1, a7
-; RV32ZBB-NEXT:    sub a4, a5, t0
+; RV32ZBB-NEXT:    sub a5, t0, a5
+; RV32ZBB-NEXT:    sub a4, a7, a4
 ; RV32ZBB-NEXT:    sub a3, a2, a3
-; RV32ZBB-NEXT:    srai a1, a6, 31
+; RV32ZBB-NEXT:    sltu a2, a5, t1
+; RV32ZBB-NEXT:    sub t0, a4, a6
+; RV32ZBB-NEXT:    sub a4, a5, t1
+; RV32ZBB-NEXT:    sub a5, a1, a2
+; RV32ZBB-NEXT:    srai a1, a5, 31
 ; RV32ZBB-NEXT:    xor a2, a4, a1
-; RV32ZBB-NEXT:    sltu a4, a1, a2
-; RV32ZBB-NEXT:    xor a5, a6, a1
-; RV32ZBB-NEXT:    sub a5, a1, a5
-; RV32ZBB-NEXT:    sub a4, a5, a4
-; RV32ZBB-NEXT:    xor a3, a3, a1
-; RV32ZBB-NEXT:    sltu a5, a1, a3
-; RV32ZBB-NEXT:    xor a6, t1, a1
-; RV32ZBB-NEXT:    mv a7, a5
-; RV32ZBB-NEXT:    beqz t1, .LBB31_4
+; RV32ZBB-NEXT:    xor a5, a5, a1
+; RV32ZBB-NEXT:    xor a4, a3, a1
+; RV32ZBB-NEXT:    sltu a3, a1, a2
+; RV32ZBB-NEXT:    sub a6, a1, a5
+; RV32ZBB-NEXT:    sltu a5, a1, a4
+; RV32ZBB-NEXT:    sub a3, a6, a3
+; RV32ZBB-NEXT:    xor a7, t0, a1
+; RV32ZBB-NEXT:    mv a6, a5
+; RV32ZBB-NEXT:    beqz t0, .LBB31_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu a7, a1, a6
+; RV32ZBB-NEXT:    sltu a6, a1, a7
 ; RV32ZBB-NEXT:  .LBB31_4:
 ; RV32ZBB-NEXT:    sub a2, a1, a2
-; RV32ZBB-NEXT:    sltu t0, a2, a7
-; RV32ZBB-NEXT:    sub a4, a4, t0
-; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a6, a1, a6
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    sub a7, a1, a7
+; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    sltu a4, a2, a6
+; RV32ZBB-NEXT:    sub a2, a2, a6
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_subnsw_i128:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    sltu a4, a0, a2
 ; RV64ZBB-NEXT:    sub a1, a1, a3
-; RV64ZBB-NEXT:    sub a1, a1, a4
 ; RV64ZBB-NEXT:    sub a0, a0, a2
+; RV64ZBB-NEXT:    sub a1, a1, a4
 ; RV64ZBB-NEXT:    srai a2, a1, 63
 ; RV64ZBB-NEXT:    xor a0, a0, a2
-; RV64ZBB-NEXT:    sltu a3, a2, a0
 ; RV64ZBB-NEXT:    xor a1, a1, a2
+; RV64ZBB-NEXT:    sltu a3, a2, a0
 ; RV64ZBB-NEXT:    sub a1, a2, a1
 ; RV64ZBB-NEXT:    sub a1, a1, a3
 ; RV64ZBB-NEXT:    sub a0, a2, a0
@@ -2534,64 +2534,64 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a3, 0(a2)
 ; RV32I-NEXT:    lw a4, 4(a2)
 ; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a7, 12(a2)
-; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a6, 12(a2)
+; RV32I-NEXT:    lw t0, 8(a1)
+; RV32I-NEXT:    lw t1, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a6, a5
-; RV32I-NEXT:    sub t0, t0, a7
-; RV32I-NEXT:    sltu a7, a2, a3
-; RV32I-NEXT:    sub t1, t0, t1
-; RV32I-NEXT:    mv t0, a7
-; RV32I-NEXT:    beq a1, a4, .LBB32_2
+; RV32I-NEXT:    lw a7, 4(a1)
+; RV32I-NEXT:    sltu a1, t0, a5
+; RV32I-NEXT:    sub t1, t1, a6
+; RV32I-NEXT:    sltu a6, a2, a3
+; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    beq a7, a4, .LBB32_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t0, a1, a4
+; RV32I-NEXT:    sltu t1, a7, a4
 ; RV32I-NEXT:  .LBB32_2:
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sltu a6, a5, t0
-; RV32I-NEXT:    sub a6, t1, a6
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub t1, a1, a7
-; RV32I-NEXT:    sub a4, a5, t0
+; RV32I-NEXT:    sub a5, t0, a5
+; RV32I-NEXT:    sub a4, a7, a4
 ; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    srai a1, a6, 31
+; RV32I-NEXT:    sltu a2, a5, t1
+; RV32I-NEXT:    sub t0, a4, a6
+; RV32I-NEXT:    sub a4, a5, t1
+; RV32I-NEXT:    sub a5, a1, a2
+; RV32I-NEXT:    srai a1, a5, 31
 ; RV32I-NEXT:    xor a2, a4, a1
-; RV32I-NEXT:    sltu a4, a1, a2
-; RV32I-NEXT:    xor a5, a6, a1
-; RV32I-NEXT:    sub a5, a1, a5
-; RV32I-NEXT:    sub a4, a5, a4
-; RV32I-NEXT:    xor a3, a3, a1
-; RV32I-NEXT:    sltu a5, a1, a3
-; RV32I-NEXT:    xor a6, t1, a1
-; RV32I-NEXT:    mv a7, a5
-; RV32I-NEXT:    beqz t1, .LBB32_4
+; RV32I-NEXT:    xor a5, a5, a1
+; RV32I-NEXT:    xor a4, a3, a1
+; RV32I-NEXT:    sltu a3, a1, a2
+; RV32I-NEXT:    sub a6, a1, a5
+; RV32I-NEXT:    sltu a5, a1, a4
+; RV32I-NEXT:    sub a3, a6, a3
+; RV32I-NEXT:    xor a7, t0, a1
+; RV32I-NEXT:    mv a6, a5
+; RV32I-NEXT:    beqz t0, .LBB32_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu a7, a1, a6
+; RV32I-NEXT:    sltu a6, a1, a7
 ; RV32I-NEXT:  .LBB32_4:
 ; RV32I-NEXT:    sub a2, a1, a2
-; RV32I-NEXT:    sltu t0, a2, a7
-; RV32I-NEXT:    sub a4, a4, t0
-; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a6, a1, a6
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a7, a1, a7
+; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    sltu a4, a2, a6
+; RV32I-NEXT:    sub a2, a2, a6
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_subnsw_i128_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a4, a0, a2
 ; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    sub a1, a1, a4
 ; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    sub a1, a1, a4
 ; RV64I-NEXT:    srai a2, a1, 63
 ; RV64I-NEXT:    xor a0, a0, a2
-; RV64I-NEXT:    sltu a3, a2, a0
 ; RV64I-NEXT:    xor a1, a1, a2
+; RV64I-NEXT:    sltu a3, a2, a0
 ; RV64I-NEXT:    sub a1, a2, a1
 ; RV64I-NEXT:    sub a1, a1, a3
 ; RV64I-NEXT:    sub a0, a2, a0
@@ -2602,64 +2602,64 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
 ; RV32ZBB-NEXT:    lw a4, 4(a2)
 ; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a7, 12(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a1)
-; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a2)
+; RV32ZBB-NEXT:    lw t0, 8(a1)
+; RV32ZBB-NEXT:    lw t1, 12(a1)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a6, a5
-; RV32ZBB-NEXT:    sub t0, t0, a7
-; RV32ZBB-NEXT:    sltu a7, a2, a3
-; RV32ZBB-NEXT:    sub t1, t0, t1
-; RV32ZBB-NEXT:    mv t0, a7
-; RV32ZBB-NEXT:    beq a1, a4, .LBB32_2
+; RV32ZBB-NEXT:    lw a7, 4(a1)
+; RV32ZBB-NEXT:    sltu a1, t0, a5
+; RV32ZBB-NEXT:    sub t1, t1, a6
+; RV32ZBB-NEXT:    sltu a6, a2, a3
+; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    beq a7, a4, .LBB32_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t0, a1, a4
+; RV32ZBB-NEXT:    sltu t1, a7, a4
 ; RV32ZBB-NEXT:  .LBB32_2:
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sltu a6, a5, t0
-; RV32ZBB-NEXT:    sub a6, t1, a6
-; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub t1, a1, a7
-; RV32ZBB-NEXT:    sub a4, a5, t0
+; RV32ZBB-NEXT:    sub a5, t0, a5
+; RV32ZBB-NEXT:    sub a4, a7, a4
 ; RV32ZBB-NEXT:    sub a3, a2, a3
-; RV32ZBB-NEXT:    srai a1, a6, 31
+; RV32ZBB-NEXT:    sltu a2, a5, t1
+; RV32ZBB-NEXT:    sub t0, a4, a6
+; RV32ZBB-NEXT:    sub a4, a5, t1
+; RV32ZBB-NEXT:    sub a5, a1, a2
+; RV32ZBB-NEXT:    srai a1, a5, 31
 ; RV32ZBB-NEXT:    xor a2, a4, a1
-; RV32ZBB-NEXT:    sltu a4, a1, a2
-; RV32ZBB-NEXT:    xor a5, a6, a1
-; RV32ZBB-NEXT:    sub a5, a1, a5
-; RV32ZBB-NEXT:    sub a4, a5, a4
-; RV32ZBB-NEXT:    xor a3, a3, a1
-; RV32ZBB-NEXT:    sltu a5, a1, a3
-; RV32ZBB-NEXT:    xor a6, t1, a1
-; RV32ZBB-NEXT:    mv a7, a5
-; RV32ZBB-NEXT:    beqz t1, .LBB32_4
+; RV32ZBB-NEXT:    xor a5, a5, a1
+; RV32ZBB-NEXT:    xor a4, a3, a1
+; RV32ZBB-NEXT:    sltu a3, a1, a2
+; RV32ZBB-NEXT:    sub a6, a1, a5
+; RV32ZBB-NEXT:    sltu a5, a1, a4
+; RV32ZBB-NEXT:    sub a3, a6, a3
+; RV32ZBB-NEXT:    xor a7, t0, a1
+; RV32ZBB-NEXT:    mv a6, a5
+; RV32ZBB-NEXT:    beqz t0, .LBB32_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu a7, a1, a6
+; RV32ZBB-NEXT:    sltu a6, a1, a7
 ; RV32ZBB-NEXT:  .LBB32_4:
 ; RV32ZBB-NEXT:    sub a2, a1, a2
-; RV32ZBB-NEXT:    sltu t0, a2, a7
-; RV32ZBB-NEXT:    sub a4, a4, t0
-; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a6, a1, a6
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    sub a7, a1, a7
+; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    sltu a4, a2, a6
+; RV32ZBB-NEXT:    sub a2, a2, a6
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_subnsw_i128_undef:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    sltu a4, a0, a2
 ; RV64ZBB-NEXT:    sub a1, a1, a3
-; RV64ZBB-NEXT:    sub a1, a1, a4
 ; RV64ZBB-NEXT:    sub a0, a0, a2
+; RV64ZBB-NEXT:    sub a1, a1, a4
 ; RV64ZBB-NEXT:    srai a2, a1, 63
 ; RV64ZBB-NEXT:    xor a0, a0, a2
-; RV64ZBB-NEXT:    sltu a3, a2, a0
 ; RV64ZBB-NEXT:    xor a1, a1, a2
+; RV64ZBB-NEXT:    sltu a3, a2, a0
 ; RV64ZBB-NEXT:    sub a1, a2, a1
 ; RV64ZBB-NEXT:    sub a1, a1, a3
 ; RV64ZBB-NEXT:    sub a0, a2, a0
diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index 8208eafbc205cc..56e6dacff97486 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -12,8 +12,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -24,8 +24,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -53,8 +53,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -65,8 +65,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -94,8 +94,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -106,8 +106,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -135,8 +135,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -147,8 +147,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -224,8 +224,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i16_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -236,8 +236,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -579,30 +579,30 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB11_13
 ; RV32I-NEXT:  .LBB11_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB11_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -675,30 +675,30 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB11_13
 ; RV32ZBB-NEXT:  .LBB11_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB11_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -779,30 +779,30 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB12_13
 ; RV32I-NEXT:  .LBB12_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB12_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -875,30 +875,30 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB12_13
 ; RV32ZBB-NEXT:  .LBB12_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB12_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -939,8 +939,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -951,8 +951,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -978,8 +978,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -990,8 +990,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1168,30 +1168,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB17_13
 ; RV32I-NEXT:  .LBB17_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB17_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1264,30 +1264,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB17_13
 ; RV32ZBB-NEXT:  .LBB17_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB17_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -1326,8 +1326,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_cmp_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -1338,8 +1338,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1366,8 +1366,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_cmp_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -1378,8 +1378,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1559,30 +1559,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB22_13
 ; RV32I-NEXT:  .LBB22_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB22_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1655,30 +1655,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB22_13
 ; RV32ZBB-NEXT:  .LBB22_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB22_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -2045,47 +2045,47 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a4, 0(a2)
 ; RV32I-NEXT:    lw a3, 4(a2)
-; RV32I-NEXT:    lw a6, 8(a2)
-; RV32I-NEXT:    lw t0, 12(a2)
-; RV32I-NEXT:    lw a2, 8(a1)
-; RV32I-NEXT:    lw t1, 12(a1)
-; RV32I-NEXT:    lw a5, 0(a1)
-; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    sltu a1, a2, a6
-; RV32I-NEXT:    sub t1, t1, t0
-; RV32I-NEXT:    sltu t0, a5, a4
-; RV32I-NEXT:    sub a1, t1, a1
-; RV32I-NEXT:    mv t1, t0
-; RV32I-NEXT:    beq a7, a3, .LBB31_2
+; RV32I-NEXT:    lw a5, 8(a2)
+; RV32I-NEXT:    lw a6, 12(a2)
+; RV32I-NEXT:    lw a7, 8(a1)
+; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    sltu t1, a7, a5
+; RV32I-NEXT:    sub t0, t0, a6
+; RV32I-NEXT:    sltu a6, a2, a4
+; RV32I-NEXT:    sub t0, t0, t1
+; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    beq a1, a3, .LBB31_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a7, a3
+; RV32I-NEXT:    sltu t1, a1, a3
 ; RV32I-NEXT:  .LBB31_2:
-; RV32I-NEXT:    sub a2, a2, a6
-; RV32I-NEXT:    sltu a6, a2, t1
-; RV32I-NEXT:    sub a1, a1, a6
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    sub a3, a7, a3
-; RV32I-NEXT:    sub a3, a3, t0
-; RV32I-NEXT:    sub a4, a5, a4
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a3, a1, a3
+; RV32I-NEXT:    sltu a1, a5, t1
+; RV32I-NEXT:    sub a5, a5, t1
+; RV32I-NEXT:    sub a1, t0, a1
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    bgez a1, .LBB31_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    snez a5, a3
-; RV32I-NEXT:    snez a6, a4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    neg a7, a2
-; RV32I-NEXT:    sltu t0, a7, a5
-; RV32I-NEXT:    snez a2, a2
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, t0
-; RV32I-NEXT:    sub a2, a7, a5
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    snez a4, a3
+; RV32I-NEXT:    snez a6, a2
+; RV32I-NEXT:    neg a7, a5
+; RV32I-NEXT:    snez a5, a5
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    add a1, a1, a5
 ; RV32I-NEXT:    add a3, a3, a6
+; RV32I-NEXT:    sltu a6, a7, a4
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a5, a7, a4
+; RV32I-NEXT:    sub a1, a1, a6
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:  .LBB31_4:
-; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
@@ -2108,47 +2108,47 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a4, 0(a2)
 ; RV32ZBB-NEXT:    lw a3, 4(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a2)
-; RV32ZBB-NEXT:    lw t0, 12(a2)
-; RV32ZBB-NEXT:    lw a2, 8(a1)
-; RV32ZBB-NEXT:    lw t1, 12(a1)
-; RV32ZBB-NEXT:    lw a5, 0(a1)
-; RV32ZBB-NEXT:    lw a7, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a2, a6
-; RV32ZBB-NEXT:    sub t1, t1, t0
-; RV32ZBB-NEXT:    sltu t0, a5, a4
-; RV32ZBB-NEXT:    sub a1, t1, a1
-; RV32ZBB-NEXT:    mv t1, t0
-; RV32ZBB-NEXT:    beq a7, a3, .LBB31_2
+; RV32ZBB-NEXT:    lw a5, 8(a2)
+; RV32ZBB-NEXT:    lw a6, 12(a2)
+; RV32ZBB-NEXT:    lw a7, 8(a1)
+; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
+; RV32ZBB-NEXT:    lw a1, 4(a1)
+; RV32ZBB-NEXT:    sltu t1, a7, a5
+; RV32ZBB-NEXT:    sub t0, t0, a6
+; RV32ZBB-NEXT:    sltu a6, a2, a4
+; RV32ZBB-NEXT:    sub t0, t0, t1
+; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    beq a1, a3, .LBB31_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a7, a3
+; RV32ZBB-NEXT:    sltu t1, a1, a3
 ; RV32ZBB-NEXT:  .LBB31_2:
-; RV32ZBB-NEXT:    sub a2, a2, a6
-; RV32ZBB-NEXT:    sltu a6, a2, t1
-; RV32ZBB-NEXT:    sub a1, a1, a6
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    sub a3, a7, a3
-; RV32ZBB-NEXT:    sub a3, a3, t0
-; RV32ZBB-NEXT:    sub a4, a5, a4
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a3, a1, a3
+; RV32ZBB-NEXT:    sltu a1, a5, t1
+; RV32ZBB-NEXT:    sub a5, a5, t1
+; RV32ZBB-NEXT:    sub a1, t0, a1
+; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    bgez a1, .LBB31_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    snez a5, a3
-; RV32ZBB-NEXT:    snez a6, a4
-; RV32ZBB-NEXT:    or a5, a6, a5
-; RV32ZBB-NEXT:    neg a7, a2
-; RV32ZBB-NEXT:    sltu t0, a7, a5
-; RV32ZBB-NEXT:    snez a2, a2
-; RV32ZBB-NEXT:    add a1, a1, a2
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a1, a1, t0
-; RV32ZBB-NEXT:    sub a2, a7, a5
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    snez a4, a3
+; RV32ZBB-NEXT:    snez a6, a2
+; RV32ZBB-NEXT:    neg a7, a5
+; RV32ZBB-NEXT:    snez a5, a5
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    or a4, a6, a4
+; RV32ZBB-NEXT:    add a1, a1, a5
 ; RV32ZBB-NEXT:    add a3, a3, a6
+; RV32ZBB-NEXT:    sltu a6, a7, a4
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a5, a7, a4
+; RV32ZBB-NEXT:    sub a1, a1, a6
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:  .LBB31_4:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
+; RV32ZBB-NEXT:    sw a5, 8(a0)
 ; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
@@ -2176,47 +2176,47 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a4, 0(a2)
 ; RV32I-NEXT:    lw a3, 4(a2)
-; RV32I-NEXT:    lw a6, 8(a2)
-; RV32I-NEXT:    lw t0, 12(a2)
-; RV32I-NEXT:    lw a2, 8(a1)
-; RV32I-NEXT:    lw t1, 12(a1)
-; RV32I-NEXT:    lw a5, 0(a1)
-; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    sltu a1, a2, a6
-; RV32I-NEXT:    sub t1, t1, t0
-; RV32I-NEXT:    sltu t0, a5, a4
-; RV32I-NEXT:    sub a1, t1, a1
-; RV32I-NEXT:    mv t1, t0
-; RV32I-NEXT:    beq a7, a3, .LBB32_2
+; RV32I-NEXT:    lw a5, 8(a2)
+; RV32I-NEXT:    lw a6, 12(a2)
+; RV32I-NEXT:    lw a7, 8(a1)
+; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    sltu t1, a7, a5
+; RV32I-NEXT:    sub t0, t0, a6
+; RV32I-NEXT:    sltu a6, a2, a4
+; RV32I-NEXT:    sub t0, t0, t1
+; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    beq a1, a3, .LBB32_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a7, a3
+; RV32I-NEXT:    sltu t1, a1, a3
 ; RV32I-NEXT:  .LBB32_2:
-; RV32I-NEXT:    sub a2, a2, a6
-; RV32I-NEXT:    sltu a6, a2, t1
-; RV32I-NEXT:    sub a1, a1, a6
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    sub a3, a7, a3
-; RV32I-NEXT:    sub a3, a3, t0
-; RV32I-NEXT:    sub a4, a5, a4
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a3, a1, a3
+; RV32I-NEXT:    sltu a1, a5, t1
+; RV32I-NEXT:    sub a5, a5, t1
+; RV32I-NEXT:    sub a1, t0, a1
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    bgez a1, .LBB32_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    snez a5, a3
-; RV32I-NEXT:    snez a6, a4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    neg a7, a2
-; RV32I-NEXT:    sltu t0, a7, a5
-; RV32I-NEXT:    snez a2, a2
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, t0
-; RV32I-NEXT:    sub a2, a7, a5
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    snez a4, a3
+; RV32I-NEXT:    snez a6, a2
+; RV32I-NEXT:    neg a7, a5
+; RV32I-NEXT:    snez a5, a5
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    add a1, a1, a5
 ; RV32I-NEXT:    add a3, a3, a6
+; RV32I-NEXT:    sltu a6, a7, a4
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a5, a7, a4
+; RV32I-NEXT:    sub a1, a1, a6
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:  .LBB32_4:
-; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
@@ -2239,47 +2239,47 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a4, 0(a2)
 ; RV32ZBB-NEXT:    lw a3, 4(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a2)
-; RV32ZBB-NEXT:    lw t0, 12(a2)
-; RV32ZBB-NEXT:    lw a2, 8(a1)
-; RV32ZBB-NEXT:    lw t1, 12(a1)
-; RV32ZBB-NEXT:    lw a5, 0(a1)
-; RV32ZBB-NEXT:    lw a7, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a2, a6
-; RV32ZBB-NEXT:    sub t1, t1, t0
-; RV32ZBB-NEXT:    sltu t0, a5, a4
-; RV32ZBB-NEXT:    sub a1, t1, a1
-; RV32ZBB-NEXT:    mv t1, t0
-; RV32ZBB-NEXT:    beq a7, a3, .LBB32_2
+; RV32ZBB-NEXT:    lw a5, 8(a2)
+; RV32ZBB-NEXT:    lw a6, 12(a2)
+; RV32ZBB-NEXT:    lw a7, 8(a1)
+; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
+; RV32ZBB-NEXT:    lw a1, 4(a1)
+; RV32ZBB-NEXT:    sltu t1, a7, a5
+; RV32ZBB-NEXT:    sub t0, t0, a6
+; RV32ZBB-NEXT:    sltu a6, a2, a4
+; RV32ZBB-NEXT:    sub t0, t0, t1
+; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    beq a1, a3, .LBB32_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a7, a3
+; RV32ZBB-NEXT:    sltu t1, a1, a3
 ; RV32ZBB-NEXT:  .LBB32_2:
-; RV32ZBB-NEXT:    sub a2, a2, a6
-; RV32ZBB-NEXT:    sltu a6, a2, t1
-; RV32ZBB-NEXT:    sub a1, a1, a6
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    sub a3, a7, a3
-; RV32ZBB-NEXT:    sub a3, a3, t0
-; RV32ZBB-NEXT:    sub a4, a5, a4
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a3, a1, a3
+; RV32ZBB-NEXT:    sltu a1, a5, t1
+; RV32ZBB-NEXT:    sub a5, a5, t1
+; RV32ZBB-NEXT:    sub a1, t0, a1
+; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    bgez a1, .LBB32_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    snez a5, a3
-; RV32ZBB-NEXT:    snez a6, a4
-; RV32ZBB-NEXT:    or a5, a6, a5
-; RV32ZBB-NEXT:    neg a7, a2
-; RV32ZBB-NEXT:    sltu t0, a7, a5
-; RV32ZBB-NEXT:    snez a2, a2
-; RV32ZBB-NEXT:    add a1, a1, a2
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a1, a1, t0
-; RV32ZBB-NEXT:    sub a2, a7, a5
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    snez a4, a3
+; RV32ZBB-NEXT:    snez a6, a2
+; RV32ZBB-NEXT:    neg a7, a5
+; RV32ZBB-NEXT:    snez a5, a5
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    or a4, a6, a4
+; RV32ZBB-NEXT:    add a1, a1, a5
 ; RV32ZBB-NEXT:    add a3, a3, a6
+; RV32ZBB-NEXT:    sltu a6, a7, a4
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a5, a7, a4
+; RV32ZBB-NEXT:    sub a1, a1, a6
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:  .LBB32_4:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
+; RV32ZBB-NEXT:    sw a5, 8(a0)
 ; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
@@ -2349,8 +2349,8 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_select_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -2361,8 +2361,8 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_select_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -2389,8 +2389,8 @@ define i16 @abd_select_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_select_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -2401,8 +2401,8 @@ define i16 @abd_select_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_select_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -2582,30 +2582,30 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB38_13
 ; RV32I-NEXT:  .LBB38_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB38_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -2678,30 +2678,30 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB38_13
 ; RV32ZBB-NEXT:  .LBB38_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB38_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll
index b39285c3d343f5..9e41cde7ae181e 100644
--- a/llvm/test/CodeGen/RISCV/abdu-neg.ll
+++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll
@@ -220,8 +220,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -322,8 +322,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -341,8 +341,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    slli a1, a1, 32
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    sub a0, a0, a1
 ; RV64ZBB-NEXT:    neg a1, a0
@@ -375,8 +375,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -426,8 +426,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -445,8 +445,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32_undef:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    slli a1, a1, 32
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    sub a0, a0, a1
 ; RV64ZBB-NEXT:    neg a1, a0
@@ -477,13 +477,13 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB9_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
 ; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    snez a2, a0
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    snez a1, a0
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
@@ -515,13 +515,13 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB9_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
 ; RV32ZBB-NEXT:    add a0, a2, a0
-; RV32ZBB-NEXT:    snez a2, a0
-; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    sub a3, a3, a1
+; RV32ZBB-NEXT:    snez a1, a0
+; RV32ZBB-NEXT:    add a1, a3, a1
 ; RV32ZBB-NEXT:    neg a1, a1
 ; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:    ret
@@ -557,13 +557,13 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB10_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
 ; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    snez a2, a0
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    snez a1, a0
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
@@ -595,13 +595,13 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB10_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
 ; RV32ZBB-NEXT:    add a0, a2, a0
-; RV32ZBB-NEXT:    snez a2, a0
-; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    sub a3, a3, a1
+; RV32ZBB-NEXT:    snez a1, a0
+; RV32ZBB-NEXT:    add a1, a3, a1
 ; RV32ZBB-NEXT:    neg a1, a1
 ; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:    ret
@@ -624,87 +624,87 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a4, 0(a2)
-; RV32I-NEXT:    lw a6, 4(a2)
-; RV32I-NEXT:    lw t1, 8(a2)
-; RV32I-NEXT:    lw a2, 12(a2)
-; RV32I-NEXT:    lw a3, 8(a1)
-; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    lw a7, 0(a1)
+; RV32I-NEXT:    lw a5, 0(a2)
+; RV32I-NEXT:    lw a7, 4(a2)
+; RV32I-NEXT:    lw a3, 8(a2)
+; RV32I-NEXT:    lw t1, 12(a2)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a6, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw t0, 4(a1)
-; RV32I-NEXT:    sltu a1, a3, t1
-; RV32I-NEXT:    sub a2, a5, a2
-; RV32I-NEXT:    sltu t2, a7, a4
-; RV32I-NEXT:    sub a1, a2, a1
-; RV32I-NEXT:    mv a2, t2
-; RV32I-NEXT:    beq t0, a6, .LBB11_2
+; RV32I-NEXT:    sltu a1, a4, a3
+; RV32I-NEXT:    sub t1, a6, t1
+; RV32I-NEXT:    sltu t2, a2, a5
+; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    mv t1, t2
+; RV32I-NEXT:    beq t0, a7, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a2, t0, a6
+; RV32I-NEXT:    sltu t1, t0, a7
 ; RV32I-NEXT:  .LBB11_2:
-; RV32I-NEXT:    sub t1, a3, t1
-; RV32I-NEXT:    sltu t3, t1, a2
+; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    sltu t3, a3, t1
 ; RV32I-NEXT:    sub a1, a1, t3
-; RV32I-NEXT:    sub a2, t1, a2
-; RV32I-NEXT:    beq a1, a5, .LBB11_4
+; RV32I-NEXT:    sub a3, a3, t1
+; RV32I-NEXT:    beq a1, a6, .LBB11_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t1, a5, a1
+; RV32I-NEXT:    sltu t1, a6, a1
 ; RV32I-NEXT:    j .LBB11_5
 ; RV32I-NEXT:  .LBB11_4:
-; RV32I-NEXT:    sltu t1, a3, a2
+; RV32I-NEXT:    sltu t1, a4, a3
 ; RV32I-NEXT:  .LBB11_5:
-; RV32I-NEXT:    sub a6, t0, a6
-; RV32I-NEXT:    sub a6, a6, t2
-; RV32I-NEXT:    sub t2, a7, a4
-; RV32I-NEXT:    beq a6, t0, .LBB11_7
+; RV32I-NEXT:    sub a7, t0, a7
+; RV32I-NEXT:    sub a7, a7, t2
+; RV32I-NEXT:    sub a5, a2, a5
+; RV32I-NEXT:    beq a7, t0, .LBB11_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    sltu a4, t0, a6
+; RV32I-NEXT:    sltu a2, t0, a7
 ; RV32I-NEXT:    j .LBB11_8
 ; RV32I-NEXT:  .LBB11_7:
-; RV32I-NEXT:    sltu a4, a7, t2
+; RV32I-NEXT:    sltu a2, a2, a5
 ; RV32I-NEXT:  .LBB11_8:
-; RV32I-NEXT:    xor a5, a1, a5
-; RV32I-NEXT:    xor a3, a2, a3
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    beqz a3, .LBB11_10
+; RV32I-NEXT:    xor a6, a1, a6
+; RV32I-NEXT:    xor a4, a3, a4
+; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    beqz a4, .LBB11_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a4, t1
+; RV32I-NEXT:    mv a2, t1
 ; RV32I-NEXT:  .LBB11_10:
-; RV32I-NEXT:    neg t0, a4
-; RV32I-NEXT:    xor a5, t2, t0
-; RV32I-NEXT:    sltu t2, a5, t0
-; RV32I-NEXT:    xor t3, a6, t0
-; RV32I-NEXT:    add a3, t3, a4
-; RV32I-NEXT:    sub a3, a3, t2
-; RV32I-NEXT:    snez t1, a3
-; RV32I-NEXT:    add a5, a5, a4
-; RV32I-NEXT:    snez a7, a5
-; RV32I-NEXT:    or t1, a7, t1
-; RV32I-NEXT:    beqz a6, .LBB11_12
+; RV32I-NEXT:    neg a4, a2
+; RV32I-NEXT:    xor t0, a5, a4
+; RV32I-NEXT:    xor t3, a7, a4
+; RV32I-NEXT:    sltu a5, t0, a4
+; RV32I-NEXT:    add a6, t3, a2
+; RV32I-NEXT:    add t0, t0, a2
+; RV32I-NEXT:    sub t1, a6, a5
+; RV32I-NEXT:    snez a6, t1
+; RV32I-NEXT:    snez t2, t0
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    beqz a7, .LBB11_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t2, t3, t0
+; RV32I-NEXT:    sltu a5, t3, a4
 ; RV32I-NEXT:  .LBB11_12:
-; RV32I-NEXT:    xor a2, a2, t0
-; RV32I-NEXT:    add a6, a2, a4
-; RV32I-NEXT:    sub t3, a6, t2
-; RV32I-NEXT:    neg t4, t3
-; RV32I-NEXT:    sltu t5, t4, t1
-; RV32I-NEXT:    sltu a2, a2, t0
-; RV32I-NEXT:    xor a1, a1, t0
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    sltu a2, a6, t2
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    snez a2, t3
+; RV32I-NEXT:    xor a3, a3, a4
+; RV32I-NEXT:    xor a1, a1, a4
+; RV32I-NEXT:    add t1, t1, t2
+; RV32I-NEXT:    neg a7, t0
+; RV32I-NEXT:    add t0, a3, a2
+; RV32I-NEXT:    sltu a3, a3, a4
 ; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    neg a2, t1
+; RV32I-NEXT:    sub a4, t0, a5
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sltu a3, t0, a5
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    snez a3, a4
+; RV32I-NEXT:    sltu a4, a5, a6
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    sub a3, a5, a6
 ; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, t5
-; RV32I-NEXT:    sub a2, t4, t1
-; RV32I-NEXT:    add a3, a3, a7
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    neg a4, a5
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    sw a7, 0(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a3, 8(a0)
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
@@ -723,100 +723,100 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB11_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
 ; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    snez a2, a0
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    sub a3, a3, a1
+; RV64I-NEXT:    snez a1, a0
+; RV64I-NEXT:    add a1, a3, a1
 ; RV64I-NEXT:    neg a1, a1
 ; RV64I-NEXT:    neg a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: abd_ext_i128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a4, 0(a2)
-; RV32ZBB-NEXT:    lw a6, 4(a2)
-; RV32ZBB-NEXT:    lw t1, 8(a2)
-; RV32ZBB-NEXT:    lw a2, 12(a2)
-; RV32ZBB-NEXT:    lw a3, 8(a1)
-; RV32ZBB-NEXT:    lw a5, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 0(a1)
+; RV32ZBB-NEXT:    lw a5, 0(a2)
+; RV32ZBB-NEXT:    lw a7, 4(a2)
+; RV32ZBB-NEXT:    lw a3, 8(a2)
+; RV32ZBB-NEXT:    lw t1, 12(a2)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw t0, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a3, t1
-; RV32ZBB-NEXT:    sub a2, a5, a2
-; RV32ZBB-NEXT:    sltu t2, a7, a4
-; RV32ZBB-NEXT:    sub a1, a2, a1
-; RV32ZBB-NEXT:    mv a2, t2
-; RV32ZBB-NEXT:    beq t0, a6, .LBB11_2
+; RV32ZBB-NEXT:    sltu a1, a4, a3
+; RV32ZBB-NEXT:    sub t1, a6, t1
+; RV32ZBB-NEXT:    sltu t2, a2, a5
+; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    mv t1, t2
+; RV32ZBB-NEXT:    beq t0, a7, .LBB11_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu a2, t0, a6
+; RV32ZBB-NEXT:    sltu t1, t0, a7
 ; RV32ZBB-NEXT:  .LBB11_2:
-; RV32ZBB-NEXT:    sub t1, a3, t1
-; RV32ZBB-NEXT:    sltu t3, t1, a2
+; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    sltu t3, a3, t1
 ; RV32ZBB-NEXT:    sub a1, a1, t3
-; RV32ZBB-NEXT:    sub a2, t1, a2
-; RV32ZBB-NEXT:    beq a1, a5, .LBB11_4
+; RV32ZBB-NEXT:    sub a3, a3, t1
+; RV32ZBB-NEXT:    beq a1, a6, .LBB11_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t1, a5, a1
+; RV32ZBB-NEXT:    sltu t1, a6, a1
 ; RV32ZBB-NEXT:    j .LBB11_5
 ; RV32ZBB-NEXT:  .LBB11_4:
-; RV32ZBB-NEXT:    sltu t1, a3, a2
+; RV32ZBB-NEXT:    sltu t1, a4, a3
 ; RV32ZBB-NEXT:  .LBB11_5:
-; RV32ZBB-NEXT:    sub a6, t0, a6
-; RV32ZBB-NEXT:    sub a6, a6, t2
-; RV32ZBB-NEXT:    sub t2, a7, a4
-; RV32ZBB-NEXT:    beq a6, t0, .LBB11_7
+; RV32ZBB-NEXT:    sub a7, t0, a7
+; RV32ZBB-NEXT:    sub a7, a7, t2
+; RV32ZBB-NEXT:    sub a5, a2, a5
+; RV32ZBB-NEXT:    beq a7, t0, .LBB11_7
 ; RV32ZBB-NEXT:  # %bb.6:
-; RV32ZBB-NEXT:    sltu a4, t0, a6
+; RV32ZBB-NEXT:    sltu a2, t0, a7
 ; RV32ZBB-NEXT:    j .LBB11_8
 ; RV32ZBB-NEXT:  .LBB11_7:
-; RV32ZBB-NEXT:    sltu a4, a7, t2
+; RV32ZBB-NEXT:    sltu a2, a2, a5
 ; RV32ZBB-NEXT:  .LBB11_8:
-; RV32ZBB-NEXT:    xor a5, a1, a5
-; RV32ZBB-NEXT:    xor a3, a2, a3
-; RV32ZBB-NEXT:    or a3, a3, a5
-; RV32ZBB-NEXT:    beqz a3, .LBB11_10
+; RV32ZBB-NEXT:    xor a6, a1, a6
+; RV32ZBB-NEXT:    xor a4, a3, a4
+; RV32ZBB-NEXT:    or a4, a4, a6
+; RV32ZBB-NEXT:    beqz a4, .LBB11_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    mv a4, t1
+; RV32ZBB-NEXT:    mv a2, t1
 ; RV32ZBB-NEXT:  .LBB11_10:
-; RV32ZBB-NEXT:    neg t0, a4
-; RV32ZBB-NEXT:    xor a5, t2, t0
-; RV32ZBB-NEXT:    sltu t2, a5, t0
-; RV32ZBB-NEXT:    xor t3, a6, t0
-; RV32ZBB-NEXT:    add a3, t3, a4
-; RV32ZBB-NEXT:    sub a3, a3, t2
-; RV32ZBB-NEXT:    snez t1, a3
-; RV32ZBB-NEXT:    add a5, a5, a4
-; RV32ZBB-NEXT:    snez a7, a5
-; RV32ZBB-NEXT:    or t1, a7, t1
-; RV32ZBB-NEXT:    beqz a6, .LBB11_12
+; RV32ZBB-NEXT:    neg a4, a2
+; RV32ZBB-NEXT:    xor t0, a5, a4
+; RV32ZBB-NEXT:    xor t3, a7, a4
+; RV32ZBB-NEXT:    sltu a5, t0, a4
+; RV32ZBB-NEXT:    add a6, t3, a2
+; RV32ZBB-NEXT:    add t0, t0, a2
+; RV32ZBB-NEXT:    sub t1, a6, a5
+; RV32ZBB-NEXT:    snez a6, t1
+; RV32ZBB-NEXT:    snez t2, t0
+; RV32ZBB-NEXT:    or a6, t2, a6
+; RV32ZBB-NEXT:    beqz a7, .LBB11_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t2, t3, t0
+; RV32ZBB-NEXT:    sltu a5, t3, a4
 ; RV32ZBB-NEXT:  .LBB11_12:
-; RV32ZBB-NEXT:    xor a2, a2, t0
-; RV32ZBB-NEXT:    add a6, a2, a4
-; RV32ZBB-NEXT:    sub t3, a6, t2
-; RV32ZBB-NEXT:    neg t4, t3
-; RV32ZBB-NEXT:    sltu t5, t4, t1
-; RV32ZBB-NEXT:    sltu a2, a2, t0
-; RV32ZBB-NEXT:    xor a1, a1, t0
-; RV32ZBB-NEXT:    add a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, a2
-; RV32ZBB-NEXT:    sltu a2, a6, t2
-; RV32ZBB-NEXT:    sub a1, a1, a2
-; RV32ZBB-NEXT:    snez a2, t3
+; RV32ZBB-NEXT:    xor a3, a3, a4
+; RV32ZBB-NEXT:    xor a1, a1, a4
+; RV32ZBB-NEXT:    add t1, t1, t2
+; RV32ZBB-NEXT:    neg a7, t0
+; RV32ZBB-NEXT:    add t0, a3, a2
+; RV32ZBB-NEXT:    sltu a3, a3, a4
 ; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    neg a2, t1
+; RV32ZBB-NEXT:    sub a4, t0, a5
+; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    sltu a3, t0, a5
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    snez a3, a4
+; RV32ZBB-NEXT:    sltu a4, a5, a6
+; RV32ZBB-NEXT:    add a1, a1, a3
+; RV32ZBB-NEXT:    sub a3, a5, a6
 ; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a1, a1, t5
-; RV32ZBB-NEXT:    sub a2, t4, t1
-; RV32ZBB-NEXT:    add a3, a3, a7
-; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    neg a4, a5
-; RV32ZBB-NEXT:    sw a4, 0(a0)
-; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
+; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    sw a7, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
+; RV32ZBB-NEXT:    sw a3, 8(a0)
 ; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
@@ -835,13 +835,13 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB11_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
 ; RV64ZBB-NEXT:    add a0, a2, a0
-; RV64ZBB-NEXT:    snez a2, a0
-; RV64ZBB-NEXT:    add a1, a1, a2
+; RV64ZBB-NEXT:    sub a3, a3, a1
+; RV64ZBB-NEXT:    snez a1, a0
+; RV64ZBB-NEXT:    add a1, a3, a1
 ; RV64ZBB-NEXT:    neg a1, a1
 ; RV64ZBB-NEXT:    neg a0, a0
 ; RV64ZBB-NEXT:    ret
@@ -857,87 +857,87 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i128_undef:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a4, 0(a2)
-; RV32I-NEXT:    lw a6, 4(a2)
-; RV32I-NEXT:    lw t1, 8(a2)
-; RV32I-NEXT:    lw a2, 12(a2)
-; RV32I-NEXT:    lw a3, 8(a1)
-; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    lw a7, 0(a1)
+; RV32I-NEXT:    lw a5, 0(a2)
+; RV32I-NEXT:    lw a7, 4(a2)
+; RV32I-NEXT:    lw a3, 8(a2)
+; RV32I-NEXT:    lw t1, 12(a2)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a6, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw t0, 4(a1)
-; RV32I-NEXT:    sltu a1, a3, t1
-; RV32I-NEXT:    sub a2, a5, a2
-; RV32I-NEXT:    sltu t2, a7, a4
-; RV32I-NEXT:    sub a1, a2, a1
-; RV32I-NEXT:    mv a2, t2
-; RV32I-NEXT:    beq t0, a6, .LBB12_2
+; RV32I-NEXT:    sltu a1, a4, a3
+; RV32I-NEXT:    sub t1, a6, t1
+; RV32I-NEXT:    sltu t2, a2, a5
+; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    mv t1, t2
+; RV32I-NEXT:    beq t0, a7, .LBB12_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a2, t0, a6
+; RV32I-NEXT:    sltu t1, t0, a7
 ; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    sub t1, a3, t1
-; RV32I-NEXT:    sltu t3, t1, a2
+; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    sltu t3, a3, t1
 ; RV32I-NEXT:    sub a1, a1, t3
-; RV32I-NEXT:    sub a2, t1, a2
-; RV32I-NEXT:    beq a1, a5, .LBB12_4
+; RV32I-NEXT:    sub a3, a3, t1
+; RV32I-NEXT:    beq a1, a6, .LBB12_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t1, a5, a1
+; RV32I-NEXT:    sltu t1, a6, a1
 ; RV32I-NEXT:    j .LBB12_5
 ; RV32I-NEXT:  .LBB12_4:
-; RV32I-NEXT:    sltu t1, a3, a2
+; RV32I-NEXT:    sltu t1, a4, a3
 ; RV32I-NEXT:  .LBB12_5:
-; RV32I-NEXT:    sub a6, t0, a6
-; RV32I-NEXT:    sub a6, a6, t2
-; RV32I-NEXT:    sub t2, a7, a4
-; RV32I-NEXT:    beq a6, t0, .LBB12_7
+; RV32I-NEXT:    sub a7, t0, a7
+; RV32I-NEXT:    sub a7, a7, t2
+; RV32I-NEXT:    sub a5, a2, a5
+; RV32I-NEXT:    beq a7, t0, .LBB12_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    sltu a4, t0, a6
+; RV32I-NEXT:    sltu a2, t0, a7
 ; RV32I-NEXT:    j .LBB12_8
 ; RV32I-NEXT:  .LBB12_7:
-; RV32I-NEXT:    sltu a4, a7, t2
+; RV32I-NEXT:    sltu a2, a2, a5
 ; RV32I-NEXT:  .LBB12_8:
-; RV32I-NEXT:    xor a5, a1, a5
-; RV32I-NEXT:    xor a3, a2, a3
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    beqz a3, .LBB12_10
+; RV32I-NEXT:    xor a6, a1, a6
+; RV32I-NEXT:    xor a4, a3, a4
+; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    beqz a4, .LBB12_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a4, t1
+; RV32I-NEXT:    mv a2, t1
 ; RV32I-NEXT:  .LBB12_10:
-; RV32I-NEXT:    neg t0, a4
-; RV32I-NEXT:    xor a5, t2, t0
-; RV32I-NEXT:    sltu t2, a5, t0
-; RV32I-NEXT:    xor t3, a6, t0
-; RV32I-NEXT:    add a3, t3, a4
-; RV32I-NEXT:    sub a3, a3, t2
-; RV32I-NEXT:    snez t1, a3
-; RV32I-NEXT:    add a5, a5, a4
-; RV32I-NEXT:    snez a7, a5
-; RV32I-NEXT:    or t1, a7, t1
-; RV32I-NEXT:    beqz a6, .LBB12_12
+; RV32I-NEXT:    neg a4, a2
+; RV32I-NEXT:    xor t0, a5, a4
+; RV32I-NEXT:    xor t3, a7, a4
+; RV32I-NEXT:    sltu a5, t0, a4
+; RV32I-NEXT:    add a6, t3, a2
+; RV32I-NEXT:    add t0, t0, a2
+; RV32I-NEXT:    sub t1, a6, a5
+; RV32I-NEXT:    snez a6, t1
+; RV32I-NEXT:    snez t2, t0
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    beqz a7, .LBB12_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t2, t3, t0
+; RV32I-NEXT:    sltu a5, t3, a4
 ; RV32I-NEXT:  .LBB12_12:
-; RV32I-NEXT:    xor a2, a2, t0
-; RV32I-NEXT:    add a6, a2, a4
-; RV32I-NEXT:    sub t3, a6, t2
-; RV32I-NEXT:    neg t4, t3
-; RV32I-NEXT:    sltu t5, t4, t1
-; RV32I-NEXT:    sltu a2, a2, t0
-; RV32I-NEXT:    xor a1, a1, t0
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    sltu a2, a6, t2
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    snez a2, t3
+; RV32I-NEXT:    xor a3, a3, a4
+; RV32I-NEXT:    xor a1, a1, a4
+; RV32I-NEXT:    add t1, t1, t2
+; RV32I-NEXT:    neg a7, t0
+; RV32I-NEXT:    add t0, a3, a2
+; RV32I-NEXT:    sltu a3, a3, a4
 ; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    neg a2, t1
+; RV32I-NEXT:    sub a4, t0, a5
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sltu a3, t0, a5
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    snez a3, a4
+; RV32I-NEXT:    sltu a4, a5, a6
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    sub a3, a5, a6
 ; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, t5
-; RV32I-NEXT:    sub a2, t4, t1
-; RV32I-NEXT:    add a3, a3, a7
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    neg a4, a5
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    sw a7, 0(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a3, 8(a0)
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
@@ -956,100 +956,100 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB12_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
 ; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    snez a2, a0
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    sub a3, a3, a1
+; RV64I-NEXT:    snez a1, a0
+; RV64I-NEXT:    add a1, a3, a1
 ; RV64I-NEXT:    neg a1, a1
 ; RV64I-NEXT:    neg a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: abd_ext_i128_undef:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a4, 0(a2)
-; RV32ZBB-NEXT:    lw a6, 4(a2)
-; RV32ZBB-NEXT:    lw t1, 8(a2)
-; RV32ZBB-NEXT:    lw a2, 12(a2)
-; RV32ZBB-NEXT:    lw a3, 8(a1)
-; RV32ZBB-NEXT:    lw a5, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 0(a1)
+; RV32ZBB-NEXT:    lw a5, 0(a2)
+; RV32ZBB-NEXT:    lw a7, 4(a2)
+; RV32ZBB-NEXT:    lw a3, 8(a2)
+; RV32ZBB-NEXT:    lw t1, 12(a2)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw t0, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a3, t1
-; RV32ZBB-NEXT:    sub a2, a5, a2
-; RV32ZBB-NEXT:    sltu t2, a7, a4
-; RV32ZBB-NEXT:    sub a1, a2, a1
-; RV32ZBB-NEXT:    mv a2, t2
-; RV32ZBB-NEXT:    beq t0, a6, .LBB12_2
+; RV32ZBB-NEXT:    sltu a1, a4, a3
+; RV32ZBB-NEXT:    sub t1, a6, t1
+; RV32ZBB-NEXT:    sltu t2, a2, a5
+; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    mv t1, t2
+; RV32ZBB-NEXT:    beq t0, a7, .LBB12_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu a2, t0, a6
+; RV32ZBB-NEXT:    sltu t1, t0, a7
 ; RV32ZBB-NEXT:  .LBB12_2:
-; RV32ZBB-NEXT:    sub t1, a3, t1
-; RV32ZBB-NEXT:    sltu t3, t1, a2
+; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    sltu t3, a3, t1
 ; RV32ZBB-NEXT:    sub a1, a1, t3
-; RV32ZBB-NEXT:    sub a2, t1, a2
-; RV32ZBB-NEXT:    beq a1, a5, .LBB12_4
+; RV32ZBB-NEXT:    sub a3, a3, t1
+; RV32ZBB-NEXT:    beq a1, a6, .LBB12_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t1, a5, a1
+; RV32ZBB-NEXT:    sltu t1, a6, a1
 ; RV32ZBB-NEXT:    j .LBB12_5
 ; RV32ZBB-NEXT:  .LBB12_4:
-; RV32ZBB-NEXT:    sltu t1, a3, a2
+; RV32ZBB-NEXT:    sltu t1, a4, a3
 ; RV32ZBB-NEXT:  .LBB12_5:
-; RV32ZBB-NEXT:    sub a6, t0, a6
-; RV32ZBB-NEXT:    sub a6, a6, t2
-; RV32ZBB-NEXT:    sub t2, a7, a4
-; RV32ZBB-NEXT:    beq a6, t0, .LBB12_7
+; RV32ZBB-NEXT:    sub a7, t0, a7
+; RV32ZBB-NEXT:    sub a7, a7, t2
+; RV32ZBB-NEXT:    sub a5, a2, a5
+; RV32ZBB-NEXT:    beq a7, t0, .LBB12_7
 ; RV32ZBB-NEXT:  # %bb.6:
-; RV32ZBB-NEXT:    sltu a4, t0, a6
+; RV32ZBB-NEXT:    sltu a2, t0, a7
 ; RV32ZBB-NEXT:    j .LBB12_8
 ; RV32ZBB-NEXT:  .LBB12_7:
-; RV32ZBB-NEXT:    sltu a4, a7, t2
+; RV32ZBB-NEXT:    sltu a2, a2, a5
 ; RV32ZBB-NEXT:  .LBB12_8:
-; RV32ZBB-NEXT:    xor a5, a1, a5
-; RV32ZBB-NEXT:    xor a3, a2, a3
-; RV32ZBB-NEXT:    or a3, a3, a5
-; RV32ZBB-NEXT:    beqz a3, .LBB12_10
+; RV32ZBB-NEXT:    xor a6, a1, a6
+; RV32ZBB-NEXT:    xor a4, a3, a4
+; RV32ZBB-NEXT:    or a4, a4, a6
+; RV32ZBB-NEXT:    beqz a4, .LBB12_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    mv a4, t1
+; RV32ZBB-NEXT:    mv a2, t1
 ; RV32ZBB-NEXT:  .LBB12_10:
-; RV32ZBB-NEXT:    neg t0, a4
-; RV32ZBB-NEXT:    xor a5, t2, t0
-; RV32ZBB-NEXT:    sltu t2, a5, t0
-; RV32ZBB-NEXT:    xor t3, a6, t0
-; RV32ZBB-NEXT:    add a3, t3, a4
-; RV32ZBB-NEXT:    sub a3, a3, t2
-; RV32ZBB-NEXT:    snez t1, a3
-; RV32ZBB-NEXT:    add a5, a5, a4
-; RV32ZBB-NEXT:    snez a7, a5
-; RV32ZBB-NEXT:    or t1, a7, t1
-; RV32ZBB-NEXT:    beqz a6, .LBB12_12
+; RV32ZBB-NEXT:    neg a4, a2
+; RV32ZBB-NEXT:    xor t0, a5, a4
+; RV32ZBB-NEXT:    xor t3, a7, a4
+; RV32ZBB-NEXT:    sltu a5, t0, a4
+; RV32ZBB-NEXT:    add a6, t3, a2
+; RV32ZBB-NEXT:    add t0, t0, a2
+; RV32ZBB-NEXT:    sub t1, a6, a5
+; RV32ZBB-NEXT:    snez a6, t1
+; RV32ZBB-NEXT:    snez t2, t0
+; RV32ZBB-NEXT:    or a6, t2, a6
+; RV32ZBB-NEXT:    beqz a7, .LBB12_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t2, t3, t0
+; RV32ZBB-NEXT:    sltu a5, t3, a4
 ; RV32ZBB-NEXT:  .LBB12_12:
-; RV32ZBB-NEXT:    xor a2, a2, t0
-; RV32ZBB-NEXT:    add a6, a2, a4
-; RV32ZBB-NEXT:    sub t3, a6, t2
-; RV32ZBB-NEXT:    neg t4, t3
-; RV32ZBB-NEXT:    sltu t5, t4, t1
-; RV32ZBB-NEXT:    sltu a2, a2, t0
-; RV32ZBB-NEXT:    xor a1, a1, t0
-; RV32ZBB-NEXT:    add a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, a2
-; RV32ZBB-NEXT:    sltu a2, a6, t2
-; RV32ZBB-NEXT:    sub a1, a1, a2
-; RV32ZBB-NEXT:    snez a2, t3
+; RV32ZBB-NEXT:    xor a3, a3, a4
+; RV32ZBB-NEXT:    xor a1, a1, a4
+; RV32ZBB-NEXT:    add t1, t1, t2
+; RV32ZBB-NEXT:    neg a7, t0
+; RV32ZBB-NEXT:    add t0, a3, a2
+; RV32ZBB-NEXT:    sltu a3, a3, a4
 ; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    neg a2, t1
+; RV32ZBB-NEXT:    sub a4, t0, a5
+; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    sltu a3, t0, a5
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    snez a3, a4
+; RV32ZBB-NEXT:    sltu a4, a5, a6
+; RV32ZBB-NEXT:    add a1, a1, a3
+; RV32ZBB-NEXT:    sub a3, a5, a6
 ; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a1, a1, t5
-; RV32ZBB-NEXT:    sub a2, t4, t1
-; RV32ZBB-NEXT:    add a3, a3, a7
-; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    neg a4, a5
-; RV32ZBB-NEXT:    sw a4, 0(a0)
-; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
+; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    sw a7, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
+; RV32ZBB-NEXT:    sw a3, 8(a0)
 ; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
@@ -1068,13 +1068,13 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB12_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
 ; RV64ZBB-NEXT:    add a0, a2, a0
-; RV64ZBB-NEXT:    snez a2, a0
-; RV64ZBB-NEXT:    add a1, a1, a2
+; RV64ZBB-NEXT:    sub a3, a3, a1
+; RV64ZBB-NEXT:    snez a1, a0
+; RV64ZBB-NEXT:    add a1, a3, a1
 ; RV64ZBB-NEXT:    neg a1, a1
 ; RV64ZBB-NEXT:    neg a0, a0
 ; RV64ZBB-NEXT:    ret
@@ -1402,26 +1402,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    mv a5, t0
 ; RV32I-NEXT:    mv a4, a7
 ; RV32I-NEXT:  .LBB17_19:
-; RV32I-NEXT:    sltu a6, t3, a4
-; RV32I-NEXT:    sub a7, t4, a5
-; RV32I-NEXT:    sltu a5, a2, a1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    mv a7, a5
+; RV32I-NEXT:    sltu a7, t3, a4
+; RV32I-NEXT:    sub a5, t4, a5
+; RV32I-NEXT:    sltu a6, a2, a1
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beq t1, a3, .LBB17_21
 ; RV32I-NEXT:  # %bb.20:
 ; RV32I-NEXT:    sltu a7, t1, a3
 ; RV32I-NEXT:  .LBB17_21:
 ; RV32I-NEXT:    sub a4, t3, a4
-; RV32I-NEXT:    sltu t0, a4, a7
-; RV32I-NEXT:    sub a6, a6, t0
-; RV32I-NEXT:    sub a4, a4, a7
 ; RV32I-NEXT:    sub a3, t1, a3
-; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a2, a2, a1
+; RV32I-NEXT:    sltu a1, a4, a7
+; RV32I-NEXT:    sub a4, a4, a7
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a5, a5, a1
 ; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a6, 12(a0)
+; RV32I-NEXT:    sw a5, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1529,26 +1529,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    mv a5, t0
 ; RV32ZBB-NEXT:    mv a4, a7
 ; RV32ZBB-NEXT:  .LBB17_19:
-; RV32ZBB-NEXT:    sltu a6, t3, a4
-; RV32ZBB-NEXT:    sub a7, t4, a5
-; RV32ZBB-NEXT:    sltu a5, a2, a1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    mv a7, a5
+; RV32ZBB-NEXT:    sltu a7, t3, a4
+; RV32ZBB-NEXT:    sub a5, t4, a5
+; RV32ZBB-NEXT:    sltu a6, a2, a1
+; RV32ZBB-NEXT:    sub a5, a5, a7
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beq t1, a3, .LBB17_21
 ; RV32ZBB-NEXT:  # %bb.20:
 ; RV32ZBB-NEXT:    sltu a7, t1, a3
 ; RV32ZBB-NEXT:  .LBB17_21:
 ; RV32ZBB-NEXT:    sub a4, t3, a4
-; RV32ZBB-NEXT:    sltu t0, a4, a7
-; RV32ZBB-NEXT:    sub a6, a6, t0
-; RV32ZBB-NEXT:    sub a4, a4, a7
 ; RV32ZBB-NEXT:    sub a3, t1, a3
-; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a2, a2, a1
+; RV32ZBB-NEXT:    sltu a1, a4, a7
+; RV32ZBB-NEXT:    sub a4, a4, a7
+; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a5, a5, a1
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a6, 12(a0)
+; RV32ZBB-NEXT:    sw a5, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -1835,30 +1835,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t1, a5, a6
 ; RV32I-NEXT:    sub a7, a7, t0
-; RV32I-NEXT:    sub a7, a7, t1
-; RV32I-NEXT:    sub a6, a5, a6
-; RV32I-NEXT:    sltu a5, a6, t5
-; RV32I-NEXT:    sub a5, a7, a5
-; RV32I-NEXT:    sub a6, a6, t5
+; RV32I-NEXT:    sub a5, a5, a6
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t4
+; RV32I-NEXT:    sub a6, a7, t1
+; RV32I-NEXT:    sltu a7, a5, t5
+; RV32I-NEXT:    sub a1, a5, t5
+; RV32I-NEXT:    sub a5, a4, t4
+; RV32I-NEXT:    sub a4, a6, a7
 ; RV32I-NEXT:    sub a2, a3, a2
 ; RV32I-NEXT:    j .LBB22_11
 ; RV32I-NEXT:  .LBB22_10:
 ; RV32I-NEXT:    sub a7, t0, a7
-; RV32I-NEXT:    sub a6, a6, a5
-; RV32I-NEXT:    sub a5, a7, t1
-; RV32I-NEXT:    sltu a7, a6, t3
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a5, a5, a7
-; RV32I-NEXT:    sub a6, a6, t3
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a5, a6, a5
+; RV32I-NEXT:    sub a4, a1, a4
+; RV32I-NEXT:    sub a6, a7, t1
+; RV32I-NEXT:    sltu a7, a5, t3
+; RV32I-NEXT:    sub a1, a5, t3
+; RV32I-NEXT:    sub a5, a4, t2
+; RV32I-NEXT:    sub a4, a6, a7
 ; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:  .LBB22_11:
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
-; RV32I-NEXT:    sw a6, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a5, 4(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_cmp_i128:
@@ -1922,30 +1922,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t1, a5, a6
 ; RV32ZBB-NEXT:    sub a7, a7, t0
-; RV32ZBB-NEXT:    sub a7, a7, t1
-; RV32ZBB-NEXT:    sub a6, a5, a6
-; RV32ZBB-NEXT:    sltu a5, a6, t5
-; RV32ZBB-NEXT:    sub a5, a7, a5
-; RV32ZBB-NEXT:    sub a6, a6, t5
+; RV32ZBB-NEXT:    sub a5, a5, a6
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t4
+; RV32ZBB-NEXT:    sub a6, a7, t1
+; RV32ZBB-NEXT:    sltu a7, a5, t5
+; RV32ZBB-NEXT:    sub a1, a5, t5
+; RV32ZBB-NEXT:    sub a5, a4, t4
+; RV32ZBB-NEXT:    sub a4, a6, a7
 ; RV32ZBB-NEXT:    sub a2, a3, a2
 ; RV32ZBB-NEXT:    j .LBB22_11
 ; RV32ZBB-NEXT:  .LBB22_10:
 ; RV32ZBB-NEXT:    sub a7, t0, a7
-; RV32ZBB-NEXT:    sub a6, a6, a5
-; RV32ZBB-NEXT:    sub a5, a7, t1
-; RV32ZBB-NEXT:    sltu a7, a6, t3
-; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a5, a5, a7
-; RV32ZBB-NEXT:    sub a6, a6, t3
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a5, a6, a5
+; RV32ZBB-NEXT:    sub a4, a1, a4
+; RV32ZBB-NEXT:    sub a6, a7, t1
+; RV32ZBB-NEXT:    sltu a7, a5, t3
+; RV32ZBB-NEXT:    sub a1, a5, t3
+; RV32ZBB-NEXT:    sub a5, a4, t2
+; RV32ZBB-NEXT:    sub a4, a6, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:  .LBB22_11:
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
-; RV32ZBB-NEXT:    sw a6, 8(a0)
-; RV32ZBB-NEXT:    sw a5, 12(a0)
+; RV32ZBB-NEXT:    sw a5, 4(a0)
+; RV32ZBB-NEXT:    sw a1, 8(a0)
+; RV32ZBB-NEXT:    sw a4, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_cmp_i128:
diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll
index 814bca98523ce8..7c8638cb461e26 100644
--- a/llvm/test/CodeGen/RISCV/abdu.ll
+++ b/llvm/test/CodeGen/RISCV/abdu.ll
@@ -178,8 +178,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -198,8 +198,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i16_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    zext.h a0, a0
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
 ; RV64ZBB-NEXT:    sub a0, a0, a2
@@ -267,8 +267,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -286,8 +286,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -317,8 +317,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -337,8 +337,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32_i16:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    zext.h a1, a1
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
 ; RV64ZBB-NEXT:    sub a0, a0, a2
@@ -365,8 +365,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -384,8 +384,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32_undef:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -415,10 +415,10 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB9_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -447,10 +447,10 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB9_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -484,10 +484,10 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB10_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -516,10 +516,10 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB10_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -587,29 +587,29 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB11_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB11_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB11_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_ext_i128:
@@ -627,10 +627,10 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB11_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -683,29 +683,29 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB11_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB11_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB11_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_ext_i128:
@@ -723,10 +723,10 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB11_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %aext = zext i128 %a to i256
@@ -787,29 +787,29 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB12_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB12_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB12_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_ext_i128_undef:
@@ -827,10 +827,10 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB12_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -883,29 +883,29 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB12_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB12_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB12_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_ext_i128_undef:
@@ -923,10 +923,10 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB12_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %aext = zext i128 %a to i256
@@ -1029,8 +1029,8 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1048,8 +1048,8 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_minmax_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -1077,10 +1077,10 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB16_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -1109,10 +1109,10 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB16_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -1178,29 +1178,29 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB17_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB17_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB17_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_minmax_i128:
@@ -1218,10 +1218,10 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB17_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -1274,29 +1274,29 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB17_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB17_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB17_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_minmax_i128:
@@ -1314,10 +1314,10 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB17_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %min = call i128 @llvm.umin.i128(i128 %a, i128 %b)
@@ -1420,8 +1420,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1439,8 +1439,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_cmp_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -1469,10 +1469,10 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB21_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -1501,10 +1501,10 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB21_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -1571,29 +1571,29 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB22_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB22_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB22_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_cmp_i128:
@@ -1611,10 +1611,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB22_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -1667,29 +1667,29 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB22_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB22_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB22_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_cmp_i128:
@@ -1707,10 +1707,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB22_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %cmp = icmp uge i128 %a, %b
@@ -1814,8 +1814,8 @@ define i32 @abd_select_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_select_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1833,8 +1833,8 @@ define i32 @abd_select_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_select_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -1863,10 +1863,10 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB26_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -1895,10 +1895,10 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB26_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -1965,29 +1965,29 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB27_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB27_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB27_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_select_i128:
@@ -2005,10 +2005,10 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB27_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -2061,29 +2061,29 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB27_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB27_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB27_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_select_i128:
@@ -2101,10 +2101,10 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB27_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %cmp = icmp ult i128 %a, %b
diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index db7498340d3951..5d4478f9d4b5f0 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -171,16 +171,16 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    srli a5, a2, 29
 ; RV32I-NEXT:    slli a6, a3, 3
-; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    srli a3, a3, 29
+; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    slli a6, a4, 3
 ; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    lui a6, 128
 ; RV32I-NEXT:    srli a4, a4, 29
 ; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a2, a2, 3
-; RV32I-NEXT:    lui a4, 128
-; RV32I-NEXT:    add a1, a1, a4
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    add a1, a1, a6
 ; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a3, 8(a0)
@@ -191,8 +191,8 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 61
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    addi a2, zero, 1
 ; RV64I-NEXT:    slli a2, a2, 51
 ; RV64I-NEXT:    add a1, a1, a2
@@ -200,23 +200,23 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ;
 ; RV32C-LABEL: add_wide_operand:
 ; RV32C:       # %bb.0:
-; RV32C-NEXT:    c.lw a2, 12(a1)
-; RV32C-NEXT:    lw a6, 0(a1)
-; RV32C-NEXT:    c.lw a3, 4(a1)
+; RV32C-NEXT:    c.lw a4, 12(a1)
+; RV32C-NEXT:    c.lw a3, 0(a1)
+; RV32C-NEXT:    c.lw a2, 4(a1)
 ; RV32C-NEXT:    c.lw a1, 8(a1)
 ; RV32C-NEXT:    c.lui a5, 16
-; RV32C-NEXT:    c.add a2, a5
-; RV32C-NEXT:    c.slli a2, 3
-; RV32C-NEXT:    srli a5, a1, 29
-; RV32C-NEXT:    c.or a2, a5
-; RV32C-NEXT:    srli a5, a6, 29
-; RV32C-NEXT:    slli a4, a3, 3
+; RV32C-NEXT:    add a6, a4, a5
+; RV32C-NEXT:    srli a5, a3, 29
+; RV32C-NEXT:    slli a4, a2, 3
 ; RV32C-NEXT:    c.or a4, a5
-; RV32C-NEXT:    c.srli a3, 29
+; RV32C-NEXT:    srli a5, a1, 29
+; RV32C-NEXT:    c.srli a2, 29
 ; RV32C-NEXT:    c.slli a1, 3
-; RV32C-NEXT:    c.or a1, a3
+; RV32C-NEXT:    c.slli a3, 3
 ; RV32C-NEXT:    c.slli a6, 3
-; RV32C-NEXT:    sw a6, 0(a0)
+; RV32C-NEXT:    c.or a1, a2
+; RV32C-NEXT:    or a2, a6, a5
+; RV32C-NEXT:    c.sw a3, 0(a0)
 ; RV32C-NEXT:    c.sw a4, 4(a0)
 ; RV32C-NEXT:    c.sw a1, 8(a0)
 ; RV32C-NEXT:    c.sw a2, 12(a0)
@@ -226,8 +226,8 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ; RV64C:       # %bb.0:
 ; RV64C-NEXT:    srli a2, a0, 61
 ; RV64C-NEXT:    c.slli a1, 3
-; RV64C-NEXT:    c.or a1, a2
 ; RV64C-NEXT:    c.slli a0, 3
+; RV64C-NEXT:    c.or a1, a2
 ; RV64C-NEXT:    c.li a2, 1
 ; RV64C-NEXT:    c.slli a2, 51
 ; RV64C-NEXT:    c.add a1, a2
diff --git a/llvm/test/CodeGen/RISCV/add-imm.ll b/llvm/test/CodeGen/RISCV/add-imm.ll
index 52751f1c224211..84deb4c00ac8d1 100644
--- a/llvm/test/CodeGen/RISCV/add-imm.ll
+++ b/llvm/test/CodeGen/RISCV/add-imm.ll
@@ -213,29 +213,29 @@ define void @add32_reject() nounwind {
 ; RV32I-LABEL: add32_reject:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, %hi(ga)
-; RV32I-NEXT:    lw a1, %lo(ga)(a0)
-; RV32I-NEXT:    lui a2, %hi(gb)
-; RV32I-NEXT:    lw a3, %lo(gb)(a2)
+; RV32I-NEXT:    lui a1, %hi(gb)
+; RV32I-NEXT:    lw a2, %lo(ga)(a0)
+; RV32I-NEXT:    lw a3, %lo(gb)(a1)
 ; RV32I-NEXT:    lui a4, 1
 ; RV32I-NEXT:    addi a4, a4, -1096
-; RV32I-NEXT:    add a1, a1, a4
+; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    add a3, a3, a4
-; RV32I-NEXT:    sw a1, %lo(ga)(a0)
-; RV32I-NEXT:    sw a3, %lo(gb)(a2)
+; RV32I-NEXT:    sw a2, %lo(ga)(a0)
+; RV32I-NEXT:    sw a3, %lo(gb)(a1)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: add32_reject:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a0, %hi(ga)
-; RV64I-NEXT:    lw a1, %lo(ga)(a0)
-; RV64I-NEXT:    lui a2, %hi(gb)
-; RV64I-NEXT:    lw a3, %lo(gb)(a2)
+; RV64I-NEXT:    lui a1, %hi(gb)
+; RV64I-NEXT:    lw a2, %lo(ga)(a0)
+; RV64I-NEXT:    lw a3, %lo(gb)(a1)
 ; RV64I-NEXT:    lui a4, 1
 ; RV64I-NEXT:    addi a4, a4, -1096
-; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    add a2, a2, a4
 ; RV64I-NEXT:    add a3, a3, a4
-; RV64I-NEXT:    sw a1, %lo(ga)(a0)
-; RV64I-NEXT:    sw a3, %lo(gb)(a2)
+; RV64I-NEXT:    sw a2, %lo(ga)(a0)
+; RV64I-NEXT:    sw a3, %lo(gb)(a1)
 ; RV64I-NEXT:    ret
   %1 = load i32, ptr @ga, align 4
   %2 = load i32, ptr @gb, align 4
diff --git a/llvm/test/CodeGen/RISCV/addcarry.ll b/llvm/test/CodeGen/RISCV/addcarry.ll
index 3a4163a8bb50f9..ff0d1e75c746c5 100644
--- a/llvm/test/CodeGen/RISCV/addcarry.ll
+++ b/llvm/test/CodeGen/RISCV/addcarry.ll
@@ -12,16 +12,16 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
 ; RISCV32:       # %bb.0:
 ; RISCV32-NEXT:    mul a4, a0, a3
 ; RISCV32-NEXT:    mulhu a5, a0, a2
-; RISCV32-NEXT:    add a6, a5, a4
-; RISCV32-NEXT:    mul a4, a1, a2
-; RISCV32-NEXT:    add a4, a6, a4
-; RISCV32-NEXT:    sltu a7, a4, a6
-; RISCV32-NEXT:    sltu a5, a6, a5
-; RISCV32-NEXT:    mulhu a6, a0, a3
+; RISCV32-NEXT:    mul a6, a1, a2
+; RISCV32-NEXT:    mulhu a7, a0, a3
 ; RISCV32-NEXT:    mulhu t0, a1, a2
-; RISCV32-NEXT:    add a6, a6, t0
-; RISCV32-NEXT:    add a5, a6, a5
-; RISCV32-NEXT:    add a5, a5, a7
+; RISCV32-NEXT:    add t1, a5, a4
+; RISCV32-NEXT:    add a7, a7, t0
+; RISCV32-NEXT:    add a4, t1, a6
+; RISCV32-NEXT:    sltu a5, t1, a5
+; RISCV32-NEXT:    sltu a6, a4, t1
+; RISCV32-NEXT:    add a5, a7, a5
+; RISCV32-NEXT:    add a5, a5, a6
 ; RISCV32-NEXT:    mul a6, a1, a3
 ; RISCV32-NEXT:    add a5, a5, a6
 ; RISCV32-NEXT:    bgez a1, .LBB0_2
@@ -34,9 +34,9 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
 ; RISCV32-NEXT:  .LBB0_4:
 ; RISCV32-NEXT:    slli a5, a5, 30
 ; RISCV32-NEXT:    srli a1, a4, 2
-; RISCV32-NEXT:    or a1, a5, a1
 ; RISCV32-NEXT:    slli a4, a4, 30
 ; RISCV32-NEXT:    mul a0, a0, a2
+; RISCV32-NEXT:    or a1, a5, a1
 ; RISCV32-NEXT:    srli a0, a0, 2
 ; RISCV32-NEXT:    or a0, a4, a0
 ; RISCV32-NEXT:    ret
@@ -49,8 +49,8 @@ define { i32, i32, i1 } @addcarry_2x32(i32 %x0, i32 %x1, i32 %y0, i32 %y1) nounw
 ; RISCV32-LABEL: addcarry_2x32:
 ; RISCV32:       # %bb.0:
 ; RISCV32-NEXT:    add a3, a1, a3
-; RISCV32-NEXT:    sltu a1, a3, a1
 ; RISCV32-NEXT:    add a4, a2, a4
+; RISCV32-NEXT:    sltu a1, a3, a1
 ; RISCV32-NEXT:    sltu a2, a4, a2
 ; RISCV32-NEXT:    add a1, a4, a1
 ; RISCV32-NEXT:    sltu a4, a1, a4
diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
index a18526718461ef..8e445511b61195 100644
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -53,16 +53,16 @@ define i64 @add_mul_combine_accept_a3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_accept_a3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 29
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a2, a0, a0
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
-; RV32IMB-NEXT:    addi a0, a2, 1073
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    addi a0, a3, 1073
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -132,18 +132,18 @@ define i64 @add_mul_combine_accept_b3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_accept_b3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 23
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh3add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh3add a2, a0, a0
+; RV32IMB-NEXT:    sh3add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
 ; RV32IMB-NEXT:    lui a0, 50
 ; RV32IMB-NEXT:    addi a0, a0, 1119
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -207,18 +207,18 @@ define i64 @add_mul_combine_reject_a3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_a3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 29
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a2, a0, a0
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -185
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -278,16 +278,16 @@ define i64 @add_mul_combine_reject_c3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_c3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 73
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh3add a3, a1, a1
 ; RV32IMB-NEXT:    sh3add a1, a3, a1
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh3add a2, a0, a0
-; RV32IMB-NEXT:    sh3add a2, a2, a0
+; RV32IMB-NEXT:    sh3add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
+; RV32IMB-NEXT:    sh3add a3, a3, a0
 ; RV32IMB-NEXT:    lui a0, 18
 ; RV32IMB-NEXT:    addi a0, a0, -728
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -344,16 +344,16 @@ define i64 @add_mul_combine_reject_d3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_d3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 192
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a1, a1, a1
+; RV32IMB-NEXT:    mulhu a2, a0, a2
+; RV32IMB-NEXT:    sh1add a0, a0, a0
 ; RV32IMB-NEXT:    slli a1, a1, 6
 ; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a0, a0, a0
-; RV32IMB-NEXT:    slli a2, a0, 6
-; RV32IMB-NEXT:    lui a0, 47
-; RV32IMB-NEXT:    addi a0, a0, -512
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    lui a2, 47
+; RV32IMB-NEXT:    slli a3, a0, 6
+; RV32IMB-NEXT:    addi a0, a2, -512
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -414,18 +414,18 @@ define i64 @add_mul_combine_reject_e3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_e3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 29
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a2, a0, a0
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -185
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -491,18 +491,18 @@ define i64 @add_mul_combine_reject_f3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_f3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 29
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a2, a0, a0
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -145
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -565,16 +565,16 @@ define i64 @add_mul_combine_reject_g3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_g3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 73
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh3add a3, a1, a1
 ; RV32IMB-NEXT:    sh3add a1, a3, a1
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh3add a2, a0, a0
-; RV32IMB-NEXT:    sh3add a2, a2, a0
+; RV32IMB-NEXT:    sh3add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
+; RV32IMB-NEXT:    sh3add a3, a3, a0
 ; RV32IMB-NEXT:    lui a0, 2
 ; RV32IMB-NEXT:    addi a0, a0, -882
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -595,15 +595,15 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_infinite_loop:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 24
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a1, a1, a1
-; RV32IMB-NEXT:    sh3add a1, a1, a2
-; RV32IMB-NEXT:    sh1add a0, a0, a0
-; RV32IMB-NEXT:    slli a2, a0, 3
-; RV32IMB-NEXT:    li a3, 1
-; RV32IMB-NEXT:    slli a3, a3, 11
-; RV32IMB-NEXT:    sh3add a0, a0, a3
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a0, a0, a2
+; RV32IMB-NEXT:    li a2, 1
+; RV32IMB-NEXT:    sh3add a1, a1, a0
+; RV32IMB-NEXT:    slli a4, a3, 3
+; RV32IMB-NEXT:    slli a2, a2, 11
+; RV32IMB-NEXT:    sh3add a0, a3, a2
+; RV32IMB-NEXT:    sltu a2, a0, a4
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -672,10 +672,10 @@ define i64 @mul3000_add8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, -1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 2
 ; RV32IMB-NEXT:    addi a0, a0, 798
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
@@ -747,10 +747,10 @@ define i64 @mul3000_sub8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, -1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 1048574
 ; RV32IMB-NEXT:    addi a0, a0, -798
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
@@ -823,12 +823,12 @@ define i64 @mulneg3000_add8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, 1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    sub a3, a3, a0
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a3, a0
 ; RV32IMB-NEXT:    lui a0, 2
 ; RV32IMB-NEXT:    addi a0, a0, 798
 ; RV32IMB-NEXT:    add a0, a2, a0
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
@@ -899,12 +899,12 @@ define i64 @mulneg3000_sub8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, 1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    sub a3, a3, a0
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a3, a0
 ; RV32IMB-NEXT:    lui a0, 1048574
 ; RV32IMB-NEXT:    addi a0, a0, -798
 ; RV32IMB-NEXT:    add a0, a2, a0
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    addi a1, a1, -1
diff --git a/llvm/test/CodeGen/RISCV/alu16.ll b/llvm/test/CodeGen/RISCV/alu16.ll
index cb28ccdda0a54b..41f26526ef03ec 100644
--- a/llvm/test/CodeGen/RISCV/alu16.ll
+++ b/llvm/test/CodeGen/RISCV/alu16.ll
@@ -254,8 +254,8 @@ define i16 @slt(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: slt:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slt a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -263,8 +263,8 @@ define i16 @slt(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: slt:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slt a0, a0, a1
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/alu8.ll b/llvm/test/CodeGen/RISCV/alu8.ll
index d563525be7a356..6ae96e7c9deae4 100644
--- a/llvm/test/CodeGen/RISCV/alu8.ll
+++ b/llvm/test/CodeGen/RISCV/alu8.ll
@@ -252,8 +252,8 @@ define i8 @slt(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: slt:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slt a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -261,8 +261,8 @@ define i8 @slt(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: slt:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slt a0, a0, a1
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/and.ll b/llvm/test/CodeGen/RISCV/and.ll
index 79e3b954c50d8d..31c63c7f9b18f5 100644
--- a/llvm/test/CodeGen/RISCV/and.ll
+++ b/llvm/test/CodeGen/RISCV/and.ll
@@ -124,8 +124,8 @@ define i64 @and64_0x7ffffffffffff000(i64 %x) {
 ; RV32I-LABEL: and64_0x7ffffffffffff000:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 1048575
-; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
index 234a956be809ed..741860db13957a 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
@@ -99,10 +99,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
 ; RV32IA-NEXT:    li a0, 255
-; RV32IA-NEXT:    sll a0, a0, a4
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a4
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a0, a0, a4
+; RV32IA-NEXT:    sll a1, a1, a4
 ; RV32IA-NEXT:    sll a2, a2, a4
 ; RV32IA-NEXT:  .LBB2_1: # %do_cmpxchg
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -129,10 +129,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV32IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-ZACAS-NEXT:    slli a4, a0, 3
 ; RV32IA-ZACAS-NEXT:    li a0, 255
-; RV32IA-ZACAS-NEXT:    sll a0, a0, a4
 ; RV32IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-ZACAS-NEXT:    sll a1, a1, a4
 ; RV32IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-ZACAS-NEXT:    sll a0, a0, a4
+; RV32IA-ZACAS-NEXT:    sll a1, a1, a4
 ; RV32IA-ZACAS-NEXT:    sll a2, a2, a4
 ; RV32IA-ZACAS-NEXT:  .LBB2_1: # %do_cmpxchg
 ; RV32IA-ZACAS-NEXT:    # =>This Loop Header: Depth=1
@@ -159,10 +159,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV64IA-NEXT:    andi a3, a0, -4
 ; RV64IA-NEXT:    slli a4, a0, 3
 ; RV64IA-NEXT:    li a0, 255
-; RV64IA-NEXT:    sllw a0, a0, a4
 ; RV64IA-NEXT:    andi a1, a1, 255
-; RV64IA-NEXT:    sllw a1, a1, a4
 ; RV64IA-NEXT:    andi a2, a2, 255
+; RV64IA-NEXT:    sllw a0, a0, a4
+; RV64IA-NEXT:    sllw a1, a1, a4
 ; RV64IA-NEXT:    sllw a2, a2, a4
 ; RV64IA-NEXT:  .LBB2_1: # %do_cmpxchg
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
@@ -189,10 +189,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a4, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a0, 255
-; RV64IA-ZACAS-NEXT:    sllw a0, a0, a4
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a4
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a0, a0, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a4
 ; RV64IA-ZACAS-NEXT:    sllw a2, a2, a4
 ; RV64IA-ZACAS-NEXT:  .LBB2_1: # %do_cmpxchg
 ; RV64IA-ZACAS-NEXT:    # =>This Loop Header: Depth=1
@@ -240,10 +240,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
 ; RV32IA-NEXT:    li a0, 255
-; RV32IA-NEXT:    sll a0, a0, a4
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a4
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a0, a0, a4
+; RV32IA-NEXT:    sll a1, a1, a4
 ; RV32IA-NEXT:    sll a2, a2, a4
 ; RV32IA-NEXT:  .LBB3_1: # %do_cmpxchg
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -273,10 +273,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV32IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-ZACAS-NEXT:    slli a4, a0, 3
 ; RV32IA-ZACAS-NEXT:    li a0, 255
-; RV32IA-ZACAS-NEXT:    sll a0, a0, a4
 ; RV32IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-ZACAS-NEXT:    sll a1, a1, a4
 ; RV32IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-ZACAS-NEXT:    sll a0, a0, a4
+; RV32IA-ZACAS-NEXT:    sll a1, a1, a4
 ; RV32IA-ZACAS-NEXT:    sll a2, a2, a4
 ; RV32IA-ZACAS-NEXT:  .LBB3_1: # %do_cmpxchg
 ; RV32IA-ZACAS-NEXT:    # =>This Loop Header: Depth=1
@@ -306,10 +306,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV64IA-NEXT:    andi a3, a0, -4
 ; RV64IA-NEXT:    slli a4, a0, 3
 ; RV64IA-NEXT:    li a0, 255
-; RV64IA-NEXT:    sllw a0, a0, a4
 ; RV64IA-NEXT:    andi a1, a1, 255
-; RV64IA-NEXT:    sllw a1, a1, a4
 ; RV64IA-NEXT:    andi a2, a2, 255
+; RV64IA-NEXT:    sllw a0, a0, a4
+; RV64IA-NEXT:    sllw a1, a1, a4
 ; RV64IA-NEXT:    sllw a2, a2, a4
 ; RV64IA-NEXT:  .LBB3_1: # %do_cmpxchg
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
@@ -339,10 +339,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a4, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a0, 255
-; RV64IA-ZACAS-NEXT:    sllw a0, a0, a4
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a4
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a0, a0, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a4
 ; RV64IA-ZACAS-NEXT:    sllw a2, a2, a4
 ; RV64IA-ZACAS-NEXT:  .LBB3_1: # %do_cmpxchg
 ; RV64IA-ZACAS-NEXT:    # =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
index a87b49e61a8dbc..c3b972840377f8 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
@@ -43,10 +43,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a2, (a3)
@@ -79,10 +79,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w a2, (a3)
@@ -102,10 +102,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a2, (a3)
@@ -130,10 +130,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -170,10 +170,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -193,10 +193,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -216,10 +216,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -239,10 +239,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -275,10 +275,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -298,10 +298,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -326,10 +326,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -349,10 +349,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -394,10 +394,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -417,10 +417,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -440,10 +440,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -463,10 +463,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -499,10 +499,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -522,10 +522,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -550,10 +550,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -573,10 +573,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -618,10 +618,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a2, (a3)
@@ -641,10 +641,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -664,10 +664,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -687,10 +687,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -723,10 +723,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w a2, (a3)
@@ -746,10 +746,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -774,10 +774,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -797,10 +797,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -842,10 +842,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -865,10 +865,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -888,10 +888,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -911,10 +911,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -947,10 +947,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -970,10 +970,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -998,10 +998,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1021,10 +1021,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1066,10 +1066,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -1089,10 +1089,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -1112,10 +1112,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1135,10 +1135,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1171,10 +1171,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -1194,10 +1194,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -1222,10 +1222,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1245,10 +1245,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1290,10 +1290,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -1313,10 +1313,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -1336,10 +1336,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1359,10 +1359,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1395,10 +1395,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -1418,10 +1418,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -1446,10 +1446,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1469,10 +1469,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1514,10 +1514,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -1550,10 +1550,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1573,10 +1573,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -1601,10 +1601,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1646,10 +1646,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -1682,10 +1682,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1705,10 +1705,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -1733,10 +1733,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1778,10 +1778,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -1814,10 +1814,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1837,10 +1837,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -1866,10 +1866,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1915,8 +1915,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a2, (a3)
@@ -1952,8 +1952,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w a2, (a3)
@@ -1976,8 +1976,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2005,8 +2005,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2046,8 +2046,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2070,8 +2070,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2094,8 +2094,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2118,8 +2118,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2155,8 +2155,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2179,8 +2179,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2208,8 +2208,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2232,8 +2232,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2278,8 +2278,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2302,8 +2302,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2326,8 +2326,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2350,8 +2350,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2387,8 +2387,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2411,8 +2411,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2440,8 +2440,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2464,8 +2464,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2510,8 +2510,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a2, (a3)
@@ -2534,8 +2534,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2558,8 +2558,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2582,8 +2582,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2619,8 +2619,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w a2, (a3)
@@ -2643,8 +2643,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2672,8 +2672,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2696,8 +2696,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2742,8 +2742,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2766,8 +2766,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2790,8 +2790,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2814,8 +2814,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2851,8 +2851,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2875,8 +2875,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2904,8 +2904,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2928,8 +2928,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2974,8 +2974,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2998,8 +2998,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -3022,8 +3022,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -3046,8 +3046,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -3083,8 +3083,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -3107,8 +3107,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -3136,8 +3136,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -3160,8 +3160,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -3206,8 +3206,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -3230,8 +3230,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -3254,8 +3254,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -3278,8 +3278,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -3315,8 +3315,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -3339,8 +3339,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -3368,8 +3368,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -3392,8 +3392,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -3438,8 +3438,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -3475,8 +3475,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3499,8 +3499,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -3528,8 +3528,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3574,8 +3574,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -3611,8 +3611,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3635,8 +3635,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -3664,8 +3664,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3710,8 +3710,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -3747,8 +3747,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3771,8 +3771,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -3801,8 +3801,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
index bc3eac7b556d76..81518541477a83 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
@@ -46,8 +46,8 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -76,8 +76,8 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -96,8 +96,8 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -140,8 +140,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -160,8 +160,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -190,8 +190,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -210,8 +210,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -230,8 +230,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -250,8 +250,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -294,8 +294,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -314,8 +314,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -344,8 +344,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -364,8 +364,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -384,8 +384,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -404,8 +404,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -448,8 +448,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -468,8 +468,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -498,8 +498,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -518,8 +518,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -538,8 +538,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -558,8 +558,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -602,8 +602,8 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -632,8 +632,8 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -652,8 +652,8 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -1636,8 +1636,8 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -1666,8 +1666,8 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -1686,8 +1686,8 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -1730,8 +1730,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -1750,8 +1750,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -1780,8 +1780,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -1800,8 +1800,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -1820,8 +1820,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -1840,8 +1840,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -1884,8 +1884,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -1904,8 +1904,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -1934,8 +1934,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -1954,8 +1954,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -1974,8 +1974,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -1994,8 +1994,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2038,8 +2038,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -2058,8 +2058,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -2088,8 +2088,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2108,8 +2108,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2128,8 +2128,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2148,8 +2148,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2192,8 +2192,8 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -2222,8 +2222,8 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -2242,8 +2242,8 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -2286,8 +2286,8 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB20_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -2316,8 +2316,8 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB20_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2336,8 +2336,8 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB20_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2382,8 +2382,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -2402,8 +2402,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -2432,8 +2432,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2452,8 +2452,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2472,8 +2472,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2492,8 +2492,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2538,8 +2538,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -2558,8 +2558,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -2588,8 +2588,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2608,8 +2608,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2628,8 +2628,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2648,8 +2648,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2694,8 +2694,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -2714,8 +2714,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -2744,8 +2744,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2764,8 +2764,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2784,8 +2784,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2804,8 +2804,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2850,8 +2850,8 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB24_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -2880,8 +2880,8 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB24_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -2900,8 +2900,8 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB24_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -2946,9 +2946,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
+; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    not a3, a3
-; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    or a1, a1, a3
 ; RV32IA-NEXT:    amoand.w a1, a1, (a2)
@@ -2970,9 +2970,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
+; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    not a3, a3
-; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -2984,9 +2984,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
+; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    not a3, a3
-; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3022,9 +3022,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
+; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    not a3, a3
-; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aq a1, a1, (a2)
@@ -3036,9 +3036,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
+; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    not a3, a3
-; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
@@ -3060,9 +3060,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aq a1, a1, (a2)
@@ -3074,9 +3074,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3088,9 +3088,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aq a1, a1, (a2)
@@ -3102,9 +3102,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3140,9 +3140,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
+; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    not a3, a3
-; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.rl a1, a1, (a2)
@@ -3154,9 +3154,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
+; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    not a3, a3
-; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
@@ -3178,9 +3178,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.rl a1, a1, (a2)
@@ -3192,9 +3192,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3206,9 +3206,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.rl a1, a1, (a2)
@@ -3220,9 +3220,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3258,9 +3258,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
+; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    not a3, a3
-; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3272,9 +3272,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
+; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    not a3, a3
-; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
@@ -3296,9 +3296,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3310,9 +3310,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3324,9 +3324,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3338,9 +3338,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3376,9 +3376,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
+; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    not a3, a3
-; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3390,9 +3390,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
+; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    not a3, a3
-; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
@@ -3414,9 +3414,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3428,9 +3428,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3442,9 +3442,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3456,9 +3456,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3494,8 +3494,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -3525,8 +3525,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3546,8 +3546,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -3567,8 +3567,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3588,8 +3588,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3653,8 +3653,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -3674,8 +3674,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -3705,8 +3705,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -3726,8 +3726,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3747,8 +3747,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -3768,8 +3768,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -3789,8 +3789,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -3810,8 +3810,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3875,8 +3875,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -3896,8 +3896,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -3927,8 +3927,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3948,8 +3948,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3969,8 +3969,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -3990,8 +3990,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -4011,8 +4011,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -4032,8 +4032,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -4097,8 +4097,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -4118,8 +4118,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -4149,8 +4149,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -4170,8 +4170,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -4191,8 +4191,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -4212,8 +4212,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -4233,8 +4233,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -4254,8 +4254,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -4319,8 +4319,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -4350,8 +4350,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -4371,8 +4371,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -4392,8 +4392,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -4413,8 +4413,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -4436,8 +4436,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:  .LBB34_1: # %atomicrmw.start
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    and a3, a0, a1
-; RV64IA-WMO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    fence rw, rw
+; RV64IA-WMO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    slli a4, a0, 56
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    amocas.b.aqrl a0, a3, (a2)
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    srai a4, a4, 56
@@ -4452,8 +4452,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:  .LBB34_1: # %atomicrmw.start
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    and a3, a0, a1
-; RV64IA-TSO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    fence rw, rw
+; RV64IA-TSO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    slli a4, a0, 56
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    amocas.b a0, a3, (a2)
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    srai a4, a4, 56
@@ -5391,23 +5391,23 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB45_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB45_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB45_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB45_3: # in Loop: Header=BB45_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
@@ -5462,23 +5462,23 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    li a4, 255
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-NOZACAS-NEXT:  .LBB45_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a7, a1, .LBB45_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB45_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB45_3: # in Loop: Header=BB45_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5491,23 +5491,23 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    li a3, 255
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-ZACAS-NEXT:  .LBB45_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-ZACAS-NEXT:    mv a6, a5
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a7, a1, .LBB45_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB45_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB45_3: # in Loop: Header=BB45_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5576,23 +5576,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB46_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w a6, a6, (a2)
@@ -5605,23 +5605,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB46_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -5676,23 +5676,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB46_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5705,23 +5705,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB46_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5734,23 +5734,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB46_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5763,23 +5763,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB46_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5848,23 +5848,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB47_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
@@ -5877,23 +5877,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB47_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -5948,23 +5948,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB47_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -5977,23 +5977,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB47_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6006,23 +6006,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB47_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6035,23 +6035,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB47_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6120,23 +6120,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB48_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6149,23 +6149,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB48_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -6220,23 +6220,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB48_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6249,23 +6249,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB48_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6278,23 +6278,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB48_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6307,23 +6307,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB48_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6392,23 +6392,23 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB49_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB49_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB49_3: # in Loop: Header=BB49_1 Depth=1
 ; RV32IA-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6463,23 +6463,23 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    li a4, 255
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-NOZACAS-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a5, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a7, a1, .LBB49_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB49_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB49_3: # in Loop: Header=BB49_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6492,23 +6492,23 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    li a3, 255
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-ZACAS-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a5, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-ZACAS-NEXT:    mv a6, a5
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a7, a1, .LBB49_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB49_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB49_3: # in Loop: Header=BB49_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6577,23 +6577,23 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB50_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB50_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB50_3: # in Loop: Header=BB50_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
@@ -6648,23 +6648,23 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    li a4, 255
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-NOZACAS-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a1, a7, .LBB50_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB50_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB50_3: # in Loop: Header=BB50_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6677,23 +6677,23 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    li a3, 255
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-ZACAS-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-ZACAS-NEXT:    mv a6, a5
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a1, a7, .LBB50_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB50_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB50_3: # in Loop: Header=BB50_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6762,23 +6762,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB51_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w a6, a6, (a2)
@@ -6791,23 +6791,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB51_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -6862,23 +6862,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB51_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6891,23 +6891,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB51_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6920,23 +6920,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB51_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6949,23 +6949,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB51_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7034,23 +7034,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB52_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7063,23 +7063,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB52_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -7134,23 +7134,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB52_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7163,23 +7163,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB52_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7192,23 +7192,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB52_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7221,23 +7221,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB52_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7306,23 +7306,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB53_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7335,23 +7335,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB53_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -7406,23 +7406,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB53_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7435,23 +7435,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB53_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7464,23 +7464,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB53_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7493,23 +7493,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB53_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7578,23 +7578,23 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB54_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB54_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB54_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB54_3: # in Loop: Header=BB54_1 Depth=1
 ; RV32IA-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7649,23 +7649,23 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    li a4, 255
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-NOZACAS-NEXT:  .LBB54_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a5, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a1, a7, .LBB54_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB54_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB54_3: # in Loop: Header=BB54_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7678,23 +7678,23 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    li a3, 255
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-ZACAS-NEXT:  .LBB54_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a5, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-ZACAS-NEXT:    mv a6, a5
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a1, a7, .LBB54_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB54_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB54_3: # in Loop: Header=BB54_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7762,8 +7762,8 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB55_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -7826,8 +7826,8 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB55_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -7850,8 +7850,8 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB55_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -7928,8 +7928,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -7952,8 +7952,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -8016,8 +8016,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -8040,8 +8040,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8064,8 +8064,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -8088,8 +8088,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8166,8 +8166,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -8190,8 +8190,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -8254,8 +8254,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8278,8 +8278,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8302,8 +8302,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8326,8 +8326,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8404,8 +8404,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -8428,8 +8428,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -8492,8 +8492,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -8516,8 +8516,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8540,8 +8540,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -8564,8 +8564,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8642,8 +8642,8 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB59_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -8706,8 +8706,8 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB59_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -8730,8 +8730,8 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB59_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -8808,8 +8808,8 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB60_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -8872,8 +8872,8 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB60_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8896,8 +8896,8 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB60_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8974,8 +8974,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -8998,8 +8998,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -9062,8 +9062,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -9086,8 +9086,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -9110,8 +9110,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -9134,8 +9134,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -9212,8 +9212,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -9236,8 +9236,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -9300,8 +9300,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -9324,8 +9324,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -9348,8 +9348,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -9372,8 +9372,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -9450,8 +9450,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -9474,8 +9474,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -9538,8 +9538,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -9562,8 +9562,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -9586,8 +9586,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -9610,8 +9610,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -9688,8 +9688,8 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB64_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -9752,8 +9752,8 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB64_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -9776,8 +9776,8 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB64_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -12862,10 +12862,10 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    sll a4, a3, a0
-; RV32IA-NEXT:    not a4, a4
 ; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    not a3, a4
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    or a1, a1, a4
+; RV32IA-NEXT:    or a1, a1, a3
 ; RV32IA-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-NEXT:    srl a0, a1, a0
 ; RV32IA-NEXT:    ret
@@ -12887,10 +12887,10 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-NOZACAS-NEXT:    ret
@@ -12902,10 +12902,10 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-ZACAS-NEXT:    not a4, a4
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-ZACAS-NEXT:    not a3, a4
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-ZACAS-NEXT:    ret
@@ -12941,10 +12941,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    sll a4, a3, a0
-; RV32IA-WMO-NEXT:    not a4, a4
 ; RV32IA-WMO-NEXT:    and a1, a1, a3
+; RV32IA-WMO-NEXT:    not a3, a4
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    or a1, a1, a4
+; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aq a1, a1, (a2)
 ; RV32IA-WMO-NEXT:    srl a0, a1, a0
 ; RV32IA-WMO-NEXT:    ret
@@ -12956,10 +12956,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    sll a4, a3, a0
-; RV32IA-TSO-NEXT:    not a4, a4
 ; RV32IA-TSO-NEXT:    and a1, a1, a3
+; RV32IA-TSO-NEXT:    not a3, a4
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    or a1, a1, a4
+; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-TSO-NEXT:    srl a0, a1, a0
 ; RV32IA-TSO-NEXT:    ret
@@ -12981,10 +12981,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aq a1, a1, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
@@ -12996,10 +12996,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
@@ -13011,10 +13011,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aq a1, a1, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
@@ -13026,10 +13026,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
@@ -13065,10 +13065,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    sll a4, a3, a0
-; RV32IA-WMO-NEXT:    not a4, a4
 ; RV32IA-WMO-NEXT:    and a1, a1, a3
+; RV32IA-WMO-NEXT:    not a3, a4
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    or a1, a1, a4
+; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.rl a1, a1, (a2)
 ; RV32IA-WMO-NEXT:    srl a0, a1, a0
 ; RV32IA-WMO-NEXT:    ret
@@ -13080,10 +13080,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    sll a4, a3, a0
-; RV32IA-TSO-NEXT:    not a4, a4
 ; RV32IA-TSO-NEXT:    and a1, a1, a3
+; RV32IA-TSO-NEXT:    not a3, a4
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    or a1, a1, a4
+; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-TSO-NEXT:    srl a0, a1, a0
 ; RV32IA-TSO-NEXT:    ret
@@ -13105,10 +13105,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.rl a1, a1, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
@@ -13120,10 +13120,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
@@ -13135,10 +13135,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.rl a1, a1, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
@@ -13150,10 +13150,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
@@ -13189,10 +13189,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    sll a4, a3, a0
-; RV32IA-WMO-NEXT:    not a4, a4
 ; RV32IA-WMO-NEXT:    and a1, a1, a3
+; RV32IA-WMO-NEXT:    not a3, a4
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    or a1, a1, a4
+; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV32IA-WMO-NEXT:    srl a0, a1, a0
 ; RV32IA-WMO-NEXT:    ret
@@ -13204,10 +13204,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    sll a4, a3, a0
-; RV32IA-TSO-NEXT:    not a4, a4
 ; RV32IA-TSO-NEXT:    and a1, a1, a3
+; RV32IA-TSO-NEXT:    not a3, a4
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    or a1, a1, a4
+; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-TSO-NEXT:    srl a0, a1, a0
 ; RV32IA-TSO-NEXT:    ret
@@ -13229,10 +13229,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
@@ -13244,10 +13244,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
@@ -13259,10 +13259,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
@@ -13274,10 +13274,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
@@ -13313,10 +13313,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    sll a4, a3, a0
-; RV32IA-WMO-NEXT:    not a4, a4
 ; RV32IA-WMO-NEXT:    and a1, a1, a3
+; RV32IA-WMO-NEXT:    not a3, a4
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    or a1, a1, a4
+; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV32IA-WMO-NEXT:    srl a0, a1, a0
 ; RV32IA-WMO-NEXT:    ret
@@ -13328,10 +13328,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    sll a4, a3, a0
-; RV32IA-TSO-NEXT:    not a4, a4
 ; RV32IA-TSO-NEXT:    and a1, a1, a3
+; RV32IA-TSO-NEXT:    not a3, a4
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    or a1, a1, a4
+; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-TSO-NEXT:    srl a0, a1, a0
 ; RV32IA-TSO-NEXT:    ret
@@ -13353,10 +13353,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
@@ -13368,10 +13368,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
@@ -13383,10 +13383,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
@@ -13398,10 +13398,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
@@ -14411,8 +14411,8 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:  .LBB99_1: # %atomicrmw.start
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    and a3, a0, a1
-; RV64IA-WMO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    fence rw, rw
+; RV64IA-WMO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    slli a4, a0, 48
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    amocas.h.aqrl a0, a3, (a2)
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    srai a4, a4, 48
@@ -14427,8 +14427,8 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:  .LBB99_1: # %atomicrmw.start
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    and a3, a0, a1
-; RV64IA-TSO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    fence rw, rw
+; RV64IA-TSO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    slli a4, a0, 48
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    amocas.h a0, a3, (a2)
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    srai a4, a4, 48
@@ -15420,31 +15420,31 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB110_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB110_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB110_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB110_3: # in Loop: Header=BB110_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB110_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_monotonic:
@@ -15493,62 +15493,62 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    li a4, 48
+; RV64IA-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    li a5, 48
-; RV64IA-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-NOZACAS-NEXT:  .LBB110_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a7, a1, .LBB110_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB110_1 Depth=1
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB110_3: # in Loop: Header=BB110_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-NOZACAS-NEXT:    bnez a6, .LBB110_1
 ; RV64IA-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-ZACAS-LABEL: atomicrmw_max_i16_monotonic:
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    lui a4, 16
-; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-ZACAS-NEXT:    li a4, 48
+; RV64IA-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    li a5, 48
-; RV64IA-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-ZACAS-NEXT:  .LBB110_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-ZACAS-NEXT:    mv a6, a3
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-ZACAS-NEXT:    mv a6, a5
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a7, a1, .LBB110_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB110_1 Depth=1
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB110_3: # in Loop: Header=BB110_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-ZACAS-NEXT:    bnez a6, .LBB110_1
 ; RV64IA-ZACAS-NEXT:  # %bb.4:
-; RV64IA-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_monotonic:
@@ -15611,62 +15611,62 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB111_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB111_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_max_i16_acquire:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB111_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB111_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_acquire:
@@ -15715,124 +15715,124 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB111_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB111_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_acquire:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB111_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB111_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_acquire:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB111_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB111_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_acquire:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB111_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB111_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_acquire:
@@ -15895,62 +15895,62 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB112_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB112_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_max_i16_release:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB112_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB112_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_release:
@@ -15999,124 +15999,124 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB112_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB112_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_release:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB112_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB112_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_release:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB112_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB112_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_release:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB112_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB112_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_release:
@@ -16179,62 +16179,62 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB113_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB113_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_max_i16_acq_rel:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB113_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB113_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_acq_rel:
@@ -16283,124 +16283,124 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB113_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB113_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_acq_rel:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB113_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB113_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_acq_rel:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB113_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB113_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_acq_rel:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB113_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB113_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_acq_rel:
@@ -16463,31 +16463,31 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB114_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w.aqrl a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w.aqrl a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB114_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB114_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB114_3: # in Loop: Header=BB114_1 Depth=1
 ; RV32IA-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB114_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_seq_cst:
@@ -16536,62 +16536,62 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    li a4, 48
+; RV64IA-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    li a5, 48
-; RV64IA-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-NOZACAS-NEXT:  .LBB114_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a7, a1, .LBB114_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB114_1 Depth=1
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB114_3: # in Loop: Header=BB114_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-NOZACAS-NEXT:    bnez a6, .LBB114_1
 ; RV64IA-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-ZACAS-LABEL: atomicrmw_max_i16_seq_cst:
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    lui a4, 16
-; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-ZACAS-NEXT:    li a4, 48
+; RV64IA-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    li a5, 48
-; RV64IA-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-ZACAS-NEXT:  .LBB114_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-ZACAS-NEXT:    mv a6, a3
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-ZACAS-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-ZACAS-NEXT:    mv a6, a5
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a7, a1, .LBB114_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB114_1 Depth=1
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB114_3: # in Loop: Header=BB114_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-ZACAS-NEXT:    bnez a6, .LBB114_1
 ; RV64IA-ZACAS-NEXT:  # %bb.4:
-; RV64IA-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_seq_cst:
@@ -16654,31 +16654,31 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB115_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB115_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB115_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB115_3: # in Loop: Header=BB115_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB115_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_monotonic:
@@ -16727,62 +16727,62 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    li a4, 48
+; RV64IA-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    li a5, 48
-; RV64IA-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-NOZACAS-NEXT:  .LBB115_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a1, a7, .LBB115_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB115_1 Depth=1
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB115_3: # in Loop: Header=BB115_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-NOZACAS-NEXT:    bnez a6, .LBB115_1
 ; RV64IA-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-ZACAS-LABEL: atomicrmw_min_i16_monotonic:
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    lui a4, 16
-; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-ZACAS-NEXT:    li a4, 48
+; RV64IA-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    li a5, 48
-; RV64IA-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-ZACAS-NEXT:  .LBB115_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-ZACAS-NEXT:    mv a6, a3
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-ZACAS-NEXT:    mv a6, a5
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a1, a7, .LBB115_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB115_1 Depth=1
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB115_3: # in Loop: Header=BB115_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-ZACAS-NEXT:    bnez a6, .LBB115_1
 ; RV64IA-ZACAS-NEXT:  # %bb.4:
-; RV64IA-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_monotonic:
@@ -16845,62 +16845,62 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB116_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB116_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_min_i16_acquire:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB116_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB116_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_acquire:
@@ -16949,124 +16949,124 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB116_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB116_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_acquire:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB116_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB116_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_acquire:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB116_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB116_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_acquire:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB116_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB116_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_acquire:
@@ -17129,62 +17129,62 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB117_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB117_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_min_i16_release:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB117_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB117_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_release:
@@ -17233,124 +17233,124 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB117_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB117_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_release:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB117_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB117_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_release:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB117_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB117_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_release:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB117_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB117_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_release:
@@ -17413,62 +17413,62 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB118_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB118_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_min_i16_acq_rel:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB118_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB118_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_acq_rel:
@@ -17517,124 +17517,124 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB118_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB118_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_acq_rel:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB118_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB118_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_acq_rel:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB118_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB118_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_acq_rel:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB118_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB118_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_acq_rel:
@@ -17697,31 +17697,31 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB119_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w.aqrl a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w.aqrl a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB119_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB119_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB119_3: # in Loop: Header=BB119_1 Depth=1
 ; RV32IA-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB119_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_seq_cst:
@@ -17770,62 +17770,62 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    li a4, 48
+; RV64IA-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    li a5, 48
-; RV64IA-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-NOZACAS-NEXT:  .LBB119_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a1, a7, .LBB119_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB119_1 Depth=1
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB119_3: # in Loop: Header=BB119_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-NOZACAS-NEXT:    bnez a6, .LBB119_1
 ; RV64IA-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-ZACAS-LABEL: atomicrmw_min_i16_seq_cst:
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    lui a4, 16
-; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-ZACAS-NEXT:    li a4, 48
+; RV64IA-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    li a5, 48
-; RV64IA-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-ZACAS-NEXT:  .LBB119_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-ZACAS-NEXT:    mv a6, a3
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-ZACAS-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-ZACAS-NEXT:    mv a6, a5
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a1, a7, .LBB119_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB119_1 Depth=1
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB119_3: # in Loop: Header=BB119_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-ZACAS-NEXT:    bnez a6, .LBB119_1
 ; RV64IA-ZACAS-NEXT:  # %bb.4:
-; RV64IA-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_seq_cst:
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index ae7103c609b5bc..aea7473ceece4f 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -142,8 +142,8 @@ define signext i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -176,8 +176,8 @@ define signext i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -214,8 +214,8 @@ define signext i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -248,8 +248,8 @@ define signext i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -286,8 +286,8 @@ define signext i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -320,8 +320,8 @@ define signext i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -358,9 +358,9 @@ define signext i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
+; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    not a3, a3
-; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    or a1, a1, a3
 ; RV32IA-NEXT:    amoand.w a1, a1, (a2)
@@ -386,9 +386,9 @@ define signext i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
+; RV64IA-NEXT:    andi a1, a1, 255
 ; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    not a3, a3
-; RV64IA-NEXT:    andi a1, a1, 255
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    or a1, a1, a3
 ; RV64IA-NEXT:    amoand.w a1, a1, (a2)
@@ -418,8 +418,8 @@ define signext i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -453,8 +453,8 @@ define signext i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -626,23 +626,23 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB10_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB10_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB10_3: # in Loop: Header=BB10_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
@@ -700,23 +700,23 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    andi a3, a0, 24
-; RV64IA-NEXT:    li a4, 255
-; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    li a3, 255
 ; RV64IA-NEXT:    slli a1, a1, 56
+; RV64IA-NEXT:    andi a4, a0, 24
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    srai a1, a1, 56
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    xori a3, a3, 56
+; RV64IA-NEXT:    xori a4, a4, 56
 ; RV64IA-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a5, (a2)
-; RV64IA-NEXT:    and a7, a5, a4
+; RV64IA-NEXT:    and a7, a5, a3
 ; RV64IA-NEXT:    mv a6, a5
-; RV64IA-NEXT:    sll a7, a7, a3
-; RV64IA-NEXT:    sra a7, a7, a3
+; RV64IA-NEXT:    sll a7, a7, a4
+; RV64IA-NEXT:    sra a7, a7, a4
 ; RV64IA-NEXT:    bge a7, a1, .LBB10_3
 ; RV64IA-NEXT:  # %bb.2: # in Loop: Header=BB10_1 Depth=1
 ; RV64IA-NEXT:    xor a6, a5, a1
-; RV64IA-NEXT:    and a6, a6, a4
+; RV64IA-NEXT:    and a6, a6, a3
 ; RV64IA-NEXT:    xor a6, a5, a6
 ; RV64IA-NEXT:  .LBB10_3: # in Loop: Header=BB10_1 Depth=1
 ; RV64IA-NEXT:    sc.w a6, a6, (a2)
@@ -778,23 +778,23 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB11_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB11_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB11_3: # in Loop: Header=BB11_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
@@ -852,23 +852,23 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    andi a3, a0, 24
-; RV64IA-NEXT:    li a4, 255
-; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    li a3, 255
 ; RV64IA-NEXT:    slli a1, a1, 56
+; RV64IA-NEXT:    andi a4, a0, 24
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    srai a1, a1, 56
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    xori a3, a3, 56
+; RV64IA-NEXT:    xori a4, a4, 56
 ; RV64IA-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a5, (a2)
-; RV64IA-NEXT:    and a7, a5, a4
+; RV64IA-NEXT:    and a7, a5, a3
 ; RV64IA-NEXT:    mv a6, a5
-; RV64IA-NEXT:    sll a7, a7, a3
-; RV64IA-NEXT:    sra a7, a7, a3
+; RV64IA-NEXT:    sll a7, a7, a4
+; RV64IA-NEXT:    sra a7, a7, a4
 ; RV64IA-NEXT:    bge a1, a7, .LBB11_3
 ; RV64IA-NEXT:  # %bb.2: # in Loop: Header=BB11_1 Depth=1
 ; RV64IA-NEXT:    xor a6, a5, a1
-; RV64IA-NEXT:    and a6, a6, a4
+; RV64IA-NEXT:    and a6, a6, a3
 ; RV64IA-NEXT:    xor a6, a5, a6
 ; RV64IA-NEXT:  .LBB11_3: # in Loop: Header=BB11_1 Depth=1
 ; RV64IA-NEXT:    sc.w a6, a6, (a2)
@@ -929,8 +929,8 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -996,8 +996,8 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -1067,8 +1067,8 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -1134,8 +1134,8 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -1400,10 +1400,10 @@ define signext i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    sll a4, a3, a0
-; RV32IA-NEXT:    not a4, a4
 ; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    not a3, a4
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    or a1, a1, a4
+; RV32IA-NEXT:    or a1, a1, a3
 ; RV32IA-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-NEXT:    srl a0, a1, a0
 ; RV32IA-NEXT:    slli a0, a0, 16
@@ -1429,10 +1429,10 @@ define signext i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-NEXT:    lui a3, 16
 ; RV64IA-NEXT:    addi a3, a3, -1
 ; RV64IA-NEXT:    sllw a4, a3, a0
-; RV64IA-NEXT:    not a4, a4
 ; RV64IA-NEXT:    and a1, a1, a3
+; RV64IA-NEXT:    not a3, a4
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    or a1, a1, a4
+; RV64IA-NEXT:    or a1, a1, a3
 ; RV64IA-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-NEXT:    srlw a0, a1, a0
 ; RV64IA-NEXT:    slli a0, a0, 48
@@ -1674,31 +1674,31 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB21_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB21_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB21_3: # in Loop: Header=BB21_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB21_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    slli a0, a0, 16
 ; RV32IA-NEXT:    srai a0, a0, 16
 ; RV32IA-NEXT:    ret
@@ -1750,31 +1750,31 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    andi a3, a0, 24
-; RV64IA-NEXT:    lui a4, 16
-; RV64IA-NEXT:    addi a4, a4, -1
-; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    lui a3, 16
 ; RV64IA-NEXT:    slli a1, a1, 48
+; RV64IA-NEXT:    li a4, 48
+; RV64IA-NEXT:    andi a5, a0, 24
+; RV64IA-NEXT:    addi a3, a3, -1
 ; RV64IA-NEXT:    srai a1, a1, 48
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    li a5, 48
-; RV64IA-NEXT:    sub a5, a5, a3
+; RV64IA-NEXT:    sub a4, a4, a5
 ; RV64IA-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NEXT:    lr.w a3, (a2)
-; RV64IA-NEXT:    and a7, a3, a4
-; RV64IA-NEXT:    mv a6, a3
-; RV64IA-NEXT:    sll a7, a7, a5
-; RV64IA-NEXT:    sra a7, a7, a5
+; RV64IA-NEXT:    lr.w a5, (a2)
+; RV64IA-NEXT:    and a7, a5, a3
+; RV64IA-NEXT:    mv a6, a5
+; RV64IA-NEXT:    sll a7, a7, a4
+; RV64IA-NEXT:    sra a7, a7, a4
 ; RV64IA-NEXT:    bge a7, a1, .LBB21_3
 ; RV64IA-NEXT:  # %bb.2: # in Loop: Header=BB21_1 Depth=1
-; RV64IA-NEXT:    xor a6, a3, a1
-; RV64IA-NEXT:    and a6, a6, a4
-; RV64IA-NEXT:    xor a6, a3, a6
+; RV64IA-NEXT:    xor a6, a5, a1
+; RV64IA-NEXT:    and a6, a6, a3
+; RV64IA-NEXT:    xor a6, a5, a6
 ; RV64IA-NEXT:  .LBB21_3: # in Loop: Header=BB21_1 Depth=1
 ; RV64IA-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-NEXT:    bnez a6, .LBB21_1
 ; RV64IA-NEXT:  # %bb.4:
-; RV64IA-NEXT:    srlw a0, a3, a0
+; RV64IA-NEXT:    srlw a0, a5, a0
 ; RV64IA-NEXT:    slli a0, a0, 48
 ; RV64IA-NEXT:    srai a0, a0, 48
 ; RV64IA-NEXT:    ret
@@ -1830,31 +1830,31 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB22_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB22_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB22_3: # in Loop: Header=BB22_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB22_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    slli a0, a0, 16
 ; RV32IA-NEXT:    srai a0, a0, 16
 ; RV32IA-NEXT:    ret
@@ -1906,31 +1906,31 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    andi a3, a0, 24
-; RV64IA-NEXT:    lui a4, 16
-; RV64IA-NEXT:    addi a4, a4, -1
-; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    lui a3, 16
 ; RV64IA-NEXT:    slli a1, a1, 48
+; RV64IA-NEXT:    li a4, 48
+; RV64IA-NEXT:    andi a5, a0, 24
+; RV64IA-NEXT:    addi a3, a3, -1
 ; RV64IA-NEXT:    srai a1, a1, 48
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    li a5, 48
-; RV64IA-NEXT:    sub a5, a5, a3
+; RV64IA-NEXT:    sub a4, a4, a5
 ; RV64IA-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NEXT:    lr.w a3, (a2)
-; RV64IA-NEXT:    and a7, a3, a4
-; RV64IA-NEXT:    mv a6, a3
-; RV64IA-NEXT:    sll a7, a7, a5
-; RV64IA-NEXT:    sra a7, a7, a5
+; RV64IA-NEXT:    lr.w a5, (a2)
+; RV64IA-NEXT:    and a7, a5, a3
+; RV64IA-NEXT:    mv a6, a5
+; RV64IA-NEXT:    sll a7, a7, a4
+; RV64IA-NEXT:    sra a7, a7, a4
 ; RV64IA-NEXT:    bge a1, a7, .LBB22_3
 ; RV64IA-NEXT:  # %bb.2: # in Loop: Header=BB22_1 Depth=1
-; RV64IA-NEXT:    xor a6, a3, a1
-; RV64IA-NEXT:    and a6, a6, a4
-; RV64IA-NEXT:    xor a6, a3, a6
+; RV64IA-NEXT:    xor a6, a5, a1
+; RV64IA-NEXT:    and a6, a6, a3
+; RV64IA-NEXT:    xor a6, a5, a6
 ; RV64IA-NEXT:  .LBB22_3: # in Loop: Header=BB22_1 Depth=1
 ; RV64IA-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-NEXT:    bnez a6, .LBB22_1
 ; RV64IA-NEXT:  # %bb.4:
-; RV64IA-NEXT:    srlw a0, a3, a0
+; RV64IA-NEXT:    srlw a0, a5, a0
 ; RV64IA-NEXT:    slli a0, a0, 48
 ; RV64IA-NEXT:    srai a0, a0, 48
 ; RV64IA-NEXT:    ret
@@ -3806,10 +3806,10 @@ define signext i8 @cmpxchg_i8_monotonic_monotonic_val0(ptr %ptr, i8 signext %cmp
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a2, a2, a0
 ; RV32IA-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a3)
@@ -3846,10 +3846,10 @@ define signext i8 @cmpxchg_i8_monotonic_monotonic_val0(ptr %ptr, i8 signext %cmp
 ; RV64IA-NEXT:    andi a3, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a4, 255
-; RV64IA-NEXT:    sllw a4, a4, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
-; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    andi a2, a2, 255
+; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    sllw a2, a2, a0
 ; RV64IA-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a5, (a3)
@@ -3890,10 +3890,10 @@ define i1 @cmpxchg_i8_monotonic_monotonic_val1(ptr %ptr, i8 signext %cmp, i8 sig
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a2, (a3)
@@ -3929,10 +3929,10 @@ define i1 @cmpxchg_i8_monotonic_monotonic_val1(ptr %ptr, i8 signext %cmp, i8 sig
 ; RV64IA-NEXT:    andi a3, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a4, 255
-; RV64IA-NEXT:    sllw a4, a4, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
-; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    andi a2, a2, 255
+; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    sllw a0, a2, a0
 ; RV64IA-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a2, (a3)
@@ -3977,8 +3977,8 @@ define signext i16 @cmpxchg_i16_monotonic_monotonic_val0(ptr %ptr, i16 signext %
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a2, a2, a0
 ; RV32IA-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a3)
@@ -4018,8 +4018,8 @@ define signext i16 @cmpxchg_i16_monotonic_monotonic_val0(ptr %ptr, i16 signext %
 ; RV64IA-NEXT:    addi a4, a4, -1
 ; RV64IA-NEXT:    sllw a5, a4, a0
 ; RV64IA-NEXT:    and a1, a1, a4
-; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    and a2, a2, a4
+; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    sllw a2, a2, a0
 ; RV64IA-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a3)
@@ -4063,8 +4063,8 @@ define i1 @cmpxchg_i16_monotonic_monotonic_val1(ptr %ptr, i16 signext %cmp, i16
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a2, (a3)
@@ -4103,8 +4103,8 @@ define i1 @cmpxchg_i16_monotonic_monotonic_val1(ptr %ptr, i16 signext %cmp, i16
 ; RV64IA-NEXT:    addi a4, a4, -1
 ; RV64IA-NEXT:    sllw a5, a4, a0
 ; RV64IA-NEXT:    and a1, a1, a4
-; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    and a2, a2, a4
+; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    sllw a0, a2, a0
 ; RV64IA-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a2, (a3)
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index 038ddd427b0319..34b29ea1dc6c2b 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -63,8 +63,8 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a3, a0, 3
-; RV32IA-NEXT:    andi a0, a3, 24
 ; RV32IA-NEXT:    li a4, 255
+; RV32IA-NEXT:    andi a0, a3, 24
 ; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    sll a3, a4, a3
 ; RV32IA-NEXT:    not a3, a3
@@ -146,8 +146,8 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a4, a0, 3
-; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    li a5, 255
+; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    lw a3, 0(a2)
 ; RV64IA-NEXT:    sllw a4, a5, a4
 ; RV64IA-NEXT:    not a4, a4
@@ -239,8 +239,8 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
-; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    lw a6, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
@@ -329,8 +329,8 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a5, a0, 3
-; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    lui a3, 16
+; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
 ; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    sllw a5, a3, a5
@@ -520,42 +520,42 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s2, -16
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB3_3
 ; RV32I-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a4, s0
+; RV32I-NEXT:    sltu a0, a5, s0
 ; RV32I-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    and a1, a0, s2
-; RV32I-NEXT:    sltu a2, a5, a1
 ; RV32I-NEXT:    and a0, a0, s0
-; RV32I-NEXT:    sub a3, a4, a0
-; RV32I-NEXT:    sub a3, a3, a2
-; RV32I-NEXT:    sub a2, a5, a1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sltu a3, a4, a1
+; RV32I-NEXT:    sub a0, a5, a0
+; RV32I-NEXT:    sub a2, a4, a1
+; RV32I-NEXT:    sub a3, a0, a3
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB3_5
 ; RV32I-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    bne a4, s0, .LBB3_1
+; RV32I-NEXT:    bne a5, s0, .LBB3_1
 ; RV32I-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a5, s2
+; RV32I-NEXT:    sltu a0, a4, s2
 ; RV32I-NEXT:    j .LBB3_2
 ; RV32I-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -582,42 +582,42 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s2, -16
 ; RV32IA-NEXT:    mv s0, a2
 ; RV32IA-NEXT:    mv s1, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB3_3
 ; RV32IA-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a4, s0
+; RV32IA-NEXT:    sltu a0, a5, s0
 ; RV32IA-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
 ; RV32IA-NEXT:    xori a0, a0, 1
 ; RV32IA-NEXT:    neg a0, a0
 ; RV32IA-NEXT:    and a1, a0, s2
-; RV32IA-NEXT:    sltu a2, a5, a1
 ; RV32IA-NEXT:    and a0, a0, s0
-; RV32IA-NEXT:    sub a3, a4, a0
-; RV32IA-NEXT:    sub a3, a3, a2
-; RV32IA-NEXT:    sub a2, a5, a1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sltu a3, a4, a1
+; RV32IA-NEXT:    sub a0, a5, a0
+; RV32IA-NEXT:    sub a2, a4, a1
+; RV32IA-NEXT:    sub a3, a0, a3
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
 ; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB3_5
 ; RV32IA-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    bne a4, s0, .LBB3_1
+; RV32IA-NEXT:    bne a5, s0, .LBB3_1
 ; RV32IA-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a5, s2
+; RV32IA-NEXT:    sltu a0, a4, s2
 ; RV32IA-NEXT:    j .LBB3_2
 ; RV32IA-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -906,8 +906,8 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
-; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
@@ -990,8 +990,8 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a5, a0, 3
-; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    lui a3, 16
+; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
 ; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    sllw a5, a3, a5
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index de85c8ca17c15e..3ff01e4987bd5c 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -224,8 +224,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
-; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
@@ -239,8 +239,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    addi a5, a5, 1
 ; RV32IA-NEXT:    sltu a7, a7, a1
-; RV32IA-NEXT:    neg a7, a7
 ; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    neg a7, a7
 ; RV32IA-NEXT:    and a5, a7, a5
 ; RV32IA-NEXT:    sll a5, a5, a0
 ; RV32IA-NEXT:    and a7, a6, a4
@@ -309,8 +309,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a5, a0, 3
-; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    lui a3, 16
+; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
 ; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    sllw a5, a3, a5
@@ -324,8 +324,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA-NEXT:    and t0, a6, a3
 ; RV64IA-NEXT:    addi a6, a6, 1
 ; RV64IA-NEXT:    sltu t0, t0, a1
-; RV64IA-NEXT:    negw t0, t0
 ; RV64IA-NEXT:    and a6, a6, a3
+; RV64IA-NEXT:    negw t0, t0
 ; RV64IA-NEXT:    and a6, t0, a6
 ; RV64IA-NEXT:    sllw a6, a6, a0
 ; RV64IA-NEXT:    and a4, a4, a5
@@ -505,10 +505,10 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
 ; RV32I-NEXT:    addi a1, a4, 1
-; RV32I-NEXT:    seqz a2, a1
-; RV32I-NEXT:    add a3, a5, a2
 ; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    seqz a3, a1
 ; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    add a3, a5, a3
 ; RV32I-NEXT:    and a3, a0, a3
 ; RV32I-NEXT:    sw a4, 8(sp)
 ; RV32I-NEXT:    sw a5, 12(sp)
@@ -565,10 +565,10 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
 ; RV32IA-NEXT:    addi a1, a4, 1
-; RV32IA-NEXT:    seqz a2, a1
-; RV32IA-NEXT:    add a3, a5, a2
 ; RV32IA-NEXT:    neg a0, a0
+; RV32IA-NEXT:    seqz a3, a1
 ; RV32IA-NEXT:    and a2, a0, a1
+; RV32IA-NEXT:    add a3, a5, a3
 ; RV32IA-NEXT:    and a3, a0, a3
 ; RV32IA-NEXT:    sw a4, 8(sp)
 ; RV32IA-NEXT:    sw a5, 12(sp)
@@ -726,8 +726,8 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a3, a0, 3
-; RV32IA-NEXT:    andi a0, a3, 24
 ; RV32IA-NEXT:    li a4, 255
+; RV32IA-NEXT:    andi a0, a3, 24
 ; RV32IA-NEXT:    lw a6, 0(a2)
 ; RV32IA-NEXT:    sll a3, a4, a3
 ; RV32IA-NEXT:    not a3, a3
@@ -827,8 +827,8 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a4, a0, 3
-; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    li a5, 255
+; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    lw a3, 0(a2)
 ; RV64IA-NEXT:    sllw a4, a5, a4
 ; RV64IA-NEXT:    not a4, a4
@@ -938,8 +938,8 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
-; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    lw a7, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
@@ -1046,8 +1046,8 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a5, a0, 3
-; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    lui a3, 16
+; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
 ; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    sllw a5, a3, a5
diff --git a/llvm/test/CodeGen/RISCV/avgceils.ll b/llvm/test/CodeGen/RISCV/avgceils.ll
index 2ff4ad3b3b4624..64410fad6029aa 100644
--- a/llvm/test/CodeGen/RISCV/avgceils.ll
+++ b/llvm/test/CodeGen/RISCV/avgceils.ll
@@ -12,8 +12,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    addi a0, a0, 1
@@ -23,8 +23,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -41,8 +41,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    addi a0, a0, 1
@@ -52,8 +52,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -72,8 +72,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    addi a0, a0, 1
@@ -83,8 +83,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -101,8 +101,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    addi a0, a0, 1
@@ -112,8 +112,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -183,13 +183,13 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a4, a1, a3
 ; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srai a3, a1, 1
-; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a3, a0, a2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srai a2, a1, 1
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    or a3, a3, a1
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    sltu a1, a0, a3
 ; RV32I-NEXT:    sub a1, a4, a1
 ; RV32I-NEXT:    sub a0, a0, a3
@@ -214,13 +214,13 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a4, a1, a3
 ; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srai a3, a1, 1
-; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a3, a0, a2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srai a2, a1, 1
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    or a3, a3, a1
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    sltu a1, a0, a3
 ; RV32I-NEXT:    sub a1, a4, a1
 ; RV32I-NEXT:    sub a0, a0, a3
diff --git a/llvm/test/CodeGen/RISCV/avgceilu.ll b/llvm/test/CodeGen/RISCV/avgceilu.ll
index cc12b585036abb..924a50a836ddaf 100644
--- a/llvm/test/CodeGen/RISCV/avgceilu.ll
+++ b/llvm/test/CodeGen/RISCV/avgceilu.ll
@@ -132,8 +132,8 @@ define i32 @test_fixed_i32(i32 %a0, i32 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -158,8 +158,8 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -179,13 +179,13 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a4, a1, a3
 ; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srli a3, a1, 1
-; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a3, a0, a2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a1, 1
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    or a3, a3, a1
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    sltu a1, a0, a3
 ; RV32I-NEXT:    sub a1, a4, a1
 ; RV32I-NEXT:    sub a0, a0, a3
@@ -210,13 +210,13 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a4, a1, a3
 ; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srli a3, a1, 1
-; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a3, a0, a2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a1, 1
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    or a3, a3, a1
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    sltu a1, a0, a3
 ; RV32I-NEXT:    sub a1, a4, a1
 ; RV32I-NEXT:    sub a0, a0, a3
diff --git a/llvm/test/CodeGen/RISCV/avgfloors.ll b/llvm/test/CodeGen/RISCV/avgfloors.ll
index b36177de021d1b..b321f4c2f29395 100644
--- a/llvm/test/CodeGen/RISCV/avgfloors.ll
+++ b/llvm/test/CodeGen/RISCV/avgfloors.ll
@@ -12,8 +12,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srai a0, a0, 1
@@ -22,8 +22,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srai a0, a0, 1
@@ -39,8 +39,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srai a0, a0, 1
@@ -49,8 +49,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srai a0, a0, 1
@@ -67,8 +67,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srai a0, a0, 1
@@ -77,8 +77,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srai a0, a0, 1
@@ -94,8 +94,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srai a0, a0, 1
@@ -104,8 +104,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srai a0, a0, 1
@@ -172,8 +172,8 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    srai a3, a1, 1
 ; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a4, a0, a2
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a4, a4, 1
 ; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    and a2, a0, a2
@@ -203,8 +203,8 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    srai a3, a1, 1
 ; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a4, a0, a2
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a4, a4, 1
 ; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    and a2, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/avgflooru.ll b/llvm/test/CodeGen/RISCV/avgflooru.ll
index fa88c3760e455d..550cc3136bbc33 100644
--- a/llvm/test/CodeGen/RISCV/avgflooru.ll
+++ b/llvm/test/CodeGen/RISCV/avgflooru.ll
@@ -122,8 +122,8 @@ define i32 @test_fixed_i32(i32 %a0, i32 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srli a0, a0, 1
@@ -147,8 +147,8 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srli a0, a0, 1
@@ -164,20 +164,20 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    add a4, a3, a1
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    sltu a1, a0, a2
-; RV32I-NEXT:    add a2, a4, a1
-; RV32I-NEXT:    beq a2, a3, .LBB6_2
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    beq a1, a3, .LBB6_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:    sltu a2, a1, a3
 ; RV32I-NEXT:  .LBB6_2:
-; RV32I-NEXT:    slli a1, a1, 31
-; RV32I-NEXT:    srli a3, a2, 1
-; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    slli a2, a2, 31
+; RV32I-NEXT:    srli a3, a1, 1
+; RV32I-NEXT:    slli a4, a1, 31
 ; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    or a1, a3, a2
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_fixed_i64:
@@ -197,20 +197,20 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    add a4, a3, a1
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    sltu a1, a0, a2
-; RV32I-NEXT:    add a2, a4, a1
-; RV32I-NEXT:    beq a2, a3, .LBB7_2
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    beq a1, a3, .LBB7_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:    sltu a2, a1, a3
 ; RV32I-NEXT:  .LBB7_2:
-; RV32I-NEXT:    slli a1, a1, 31
-; RV32I-NEXT:    srli a3, a2, 1
-; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    slli a2, a2, 31
+; RV32I-NEXT:    srli a3, a1, 1
+; RV32I-NEXT:    slli a4, a1, 31
 ; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    or a1, a3, a2
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_ext_i64:
diff --git a/llvm/test/CodeGen/RISCV/bf16-promote.ll b/llvm/test/CodeGen/RISCV/bf16-promote.ll
index 296d94f4a4cd46..08c053fab4f676 100644
--- a/llvm/test/CodeGen/RISCV/bf16-promote.ll
+++ b/llvm/test/CodeGen/RISCV/bf16-promote.ll
@@ -114,8 +114,8 @@ define void @test_fadd(ptr %p, ptr %q) nounwind {
 ; RV64-NEXT:    lhu a0, 0(a1)
 ; RV64-NEXT:    lhu a1, 0(s0)
 ; RV64-NEXT:    slli a0, a0, 16
-; RV64-NEXT:    fmv.w.x fa5, a0
 ; RV64-NEXT:    slli a1, a1, 16
+; RV64-NEXT:    fmv.w.x fa5, a0
 ; RV64-NEXT:    fmv.w.x fa4, a1
 ; RV64-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64-NEXT:    call __truncsfbf2
@@ -135,8 +135,8 @@ define void @test_fadd(ptr %p, ptr %q) nounwind {
 ; RV32-NEXT:    lhu a0, 0(a1)
 ; RV32-NEXT:    lhu a1, 0(s0)
 ; RV32-NEXT:    slli a0, a0, 16
-; RV32-NEXT:    fmv.w.x fa5, a0
 ; RV32-NEXT:    slli a1, a1, 16
+; RV32-NEXT:    fmv.w.x fa5, a0
 ; RV32-NEXT:    fmv.w.x fa4, a1
 ; RV32-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32-NEXT:    call __truncsfbf2
diff --git a/llvm/test/CodeGen/RISCV/bfloat-arith.ll b/llvm/test/CodeGen/RISCV/bfloat-arith.ll
index b688af4234e65a..871b43e61df508 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-arith.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-arith.ll
@@ -102,11 +102,11 @@ define i32 @fneg_bf16(bfloat %a, bfloat %b) nounwind {
 ; CHECK-LABEL: fneg_bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT:    lui a0, 1048568
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa4, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
@@ -124,12 +124,12 @@ define bfloat @fsgnjn_bf16(bfloat %a, bfloat %b) nounwind {
 ; RV32IZFBFMIN:       # %bb.0:
 ; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
 ; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
+; RV32IZFBFMIN-NEXT:    lui a0, 1048568
 ; RV32IZFBFMIN-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32IZFBFMIN-NEXT:    fcvt.bf16.s fa5, fa5
-; RV32IZFBFMIN-NEXT:    fmv.x.h a0, fa5
-; RV32IZFBFMIN-NEXT:    not a0, a0
-; RV32IZFBFMIN-NEXT:    lui a1, 1048568
-; RV32IZFBFMIN-NEXT:    and a0, a0, a1
+; RV32IZFBFMIN-NEXT:    fmv.x.h a1, fa5
+; RV32IZFBFMIN-NEXT:    not a1, a1
+; RV32IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV32IZFBFMIN-NEXT:    fmv.x.h a1, fa0
 ; RV32IZFBFMIN-NEXT:    slli a1, a1, 17
 ; RV32IZFBFMIN-NEXT:    srli a1, a1, 17
@@ -141,12 +141,12 @@ define bfloat @fsgnjn_bf16(bfloat %a, bfloat %b) nounwind {
 ; RV64IZFBFMIN:       # %bb.0:
 ; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
 ; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
+; RV64IZFBFMIN-NEXT:    lui a0, 1048568
 ; RV64IZFBFMIN-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64IZFBFMIN-NEXT:    fcvt.bf16.s fa5, fa5
-; RV64IZFBFMIN-NEXT:    fmv.x.h a0, fa5
-; RV64IZFBFMIN-NEXT:    not a0, a0
-; RV64IZFBFMIN-NEXT:    lui a1, 1048568
-; RV64IZFBFMIN-NEXT:    and a0, a0, a1
+; RV64IZFBFMIN-NEXT:    fmv.x.h a1, fa5
+; RV64IZFBFMIN-NEXT:    not a1, a1
+; RV64IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV64IZFBFMIN-NEXT:    fmv.x.h a1, fa0
 ; RV64IZFBFMIN-NEXT:    slli a1, a1, 49
 ; RV64IZFBFMIN-NEXT:    srli a1, a1, 49
@@ -247,16 +247,16 @@ define bfloat @fmsub_bf16(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa2
 ; CHECK-NEXT:    fmv.w.x fa4, zero
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa5, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa4, fa1
-; CHECK-NEXT:    fcvt.s.bf16 fa3, fa0
-; CHECK-NEXT:    fmadd.s fa5, fa3, fa4, fa5
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
+; CHECK-NEXT:    fmadd.s fa5, fa4, fa3, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa0, fa5
 ; CHECK-NEXT:    ret
   %c_ = fadd bfloat 0.0, %c ; avoid negation using xor
@@ -270,17 +270,17 @@ define bfloat @fnmadd_bf16(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK-NEXT:    fmv.w.x fa4, zero
-; CHECK-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa2
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
-; CHECK-NEXT:    fmv.h.x fa5, a0
-; CHECK-NEXT:    fmv.x.h a0, fa4
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    fmv.x.h a2, fa4
+; CHECK-NEXT:    xor a1, a1, a0
+; CHECK-NEXT:    xor a0, a2, a0
+; CHECK-NEXT:    fmv.h.x fa5, a1
 ; CHECK-NEXT:    fmv.h.x fa4, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
@@ -301,17 +301,17 @@ define bfloat @fnmadd_s_2(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
 ; CHECK-NEXT:    fmv.w.x fa4, zero
-; CHECK-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa2
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
-; CHECK-NEXT:    fmv.h.x fa5, a0
-; CHECK-NEXT:    fmv.x.h a0, fa4
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    fmv.x.h a2, fa4
+; CHECK-NEXT:    xor a1, a1, a0
+; CHECK-NEXT:    xor a0, a2, a0
+; CHECK-NEXT:    fmv.h.x fa5, a1
 ; CHECK-NEXT:    fmv.h.x fa4, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
@@ -369,16 +369,16 @@ define bfloat @fnmsub_bf16(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK-NEXT:    fmv.w.x fa4, zero
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fcvt.s.bf16 fa3, fa2
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa5, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa4, fa2
-; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
-; CHECK-NEXT:    fmadd.s fa5, fa5, fa3, fa4
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa1
+; CHECK-NEXT:    fmadd.s fa5, fa5, fa4, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa0, fa5
 ; CHECK-NEXT:    ret
   %a_ = fadd bfloat 0.0, %a
@@ -392,16 +392,16 @@ define bfloat @fnmsub_bf16_2(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
 ; CHECK-NEXT:    fmv.w.x fa4, zero
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fcvt.s.bf16 fa3, fa2
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa5, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa4, fa2
-; CHECK-NEXT:    fcvt.s.bf16 fa3, fa0
-; CHECK-NEXT:    fmadd.s fa5, fa3, fa5, fa4
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
+; CHECK-NEXT:    fmadd.s fa5, fa4, fa5, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa0, fa5
 ; CHECK-NEXT:    ret
   %b_ = fadd bfloat 0.0, %b
@@ -432,11 +432,11 @@ define bfloat @fmsub_bf16_contract(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa2
 ; CHECK-NEXT:    fmv.w.x fa4, zero
+; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
+; CHECK-NEXT:    fcvt.s.bf16 fa2, fa0
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
+; CHECK-NEXT:    fmul.s fa4, fa2, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa4, fa1
-; CHECK-NEXT:    fcvt.s.bf16 fa3, fa0
-; CHECK-NEXT:    fmul.s fa4, fa3, fa4
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
@@ -454,21 +454,21 @@ define bfloat @fnmadd_bf16_contract(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK-NEXT:    fmv.w.x fa4, zero
-; CHECK-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
-; CHECK-NEXT:    fadd.s fa3, fa3, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa3, fa3
 ; CHECK-NEXT:    fcvt.s.bf16 fa2, fa2
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fadd.s fa5, fa5, fa4
+; CHECK-NEXT:    fadd.s fa3, fa3, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa2, fa4
+; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
+; CHECK-NEXT:    fcvt.bf16.s fa3, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa3
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
 ; CHECK-NEXT:    fmul.s fa5, fa5, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa5, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
@@ -489,10 +489,10 @@ define bfloat @fnmsub_bf16_contract(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK-NEXT:    fmv.w.x fa4, zero
-; CHECK-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
+; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
diff --git a/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll b/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll
index 243c7d463661a9..51ea8873d8c031 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll
@@ -292,9 +292,9 @@ define void @br_fcmp_ord(bfloat %a, bfloat %b) nounwind {
 ; RV32IZFBFMIN-LABEL: br_fcmp_ord:
 ; RV32IZFBFMIN:       # %bb.0:
 ; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
+; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
 ; RV32IZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; RV32IZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFBFMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV32IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV32IZFBFMIN-NEXT:    bnez a0, .LBB8_2
 ; RV32IZFBFMIN-NEXT:  # %bb.1: # %if.else
@@ -307,9 +307,9 @@ define void @br_fcmp_ord(bfloat %a, bfloat %b) nounwind {
 ; RV64IZFBFMIN-LABEL: br_fcmp_ord:
 ; RV64IZFBFMIN:       # %bb.0:
 ; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
+; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
 ; RV64IZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; RV64IZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFBFMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV64IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV64IZFBFMIN-NEXT:    bnez a0, .LBB8_2
 ; RV64IZFBFMIN-NEXT:  # %bb.1: # %if.else
@@ -545,9 +545,9 @@ define void @br_fcmp_uno(bfloat %a, bfloat %b) nounwind {
 ; RV32IZFBFMIN-LABEL: br_fcmp_uno:
 ; RV32IZFBFMIN:       # %bb.0:
 ; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
+; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
 ; RV32IZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; RV32IZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFBFMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV32IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV32IZFBFMIN-NEXT:    beqz a0, .LBB15_2
 ; RV32IZFBFMIN-NEXT:  # %bb.1: # %if.else
@@ -560,9 +560,9 @@ define void @br_fcmp_uno(bfloat %a, bfloat %b) nounwind {
 ; RV64IZFBFMIN-LABEL: br_fcmp_uno:
 ; RV64IZFBFMIN:       # %bb.0:
 ; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
+; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
 ; RV64IZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; RV64IZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFBFMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV64IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV64IZFBFMIN-NEXT:    beqz a0, .LBB15_2
 ; RV64IZFBFMIN-NEXT:  # %bb.1: # %if.else
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index c09acf5efb4ab2..82359769c7c22f 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -51,13 +51,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN-LABEL: fcvt_si_bf16_sat:
 ; CHECK32ZFBFMIN:       # %bb.0: # %start
 ; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK32ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK32ZFBFMIN-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK32ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; CHECK32ZFBFMIN-NEXT:    lui a1, 815104
-; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK32ZFBFMIN-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK32ZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK32ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; CHECK32ZFBFMIN-NEXT:    lui a0, 815104
+; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK32ZFBFMIN-NEXT:    neg a0, a0
+; CHECK32ZFBFMIN-NEXT:    neg a0, a1
 ; CHECK32ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fcvt.w.s a1, fa5, rtz
 ; CHECK32ZFBFMIN-NEXT:    and a0, a0, a1
@@ -66,14 +66,14 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-LABEL: fcvt_si_bf16_sat:
 ; RV32ID:       # %bb.0: # %start
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
-; RV32ID-NEXT:    slli a0, a0, 16
-; RV32ID-NEXT:    fmv.w.x fa5, a0
-; RV32ID-NEXT:    feq.s a0, fa5, fa5
+; RV32ID-NEXT:    lui a1, 815104
+; RV32ID-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV32ID-NEXT:    slli a0, a0, 16
 ; RV32ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV32ID-NEXT:    lui a1, 815104
-; RV32ID-NEXT:    fmv.w.x fa3, a1
-; RV32ID-NEXT:    fmax.s fa5, fa5, fa3
+; RV32ID-NEXT:    fmv.w.x fa3, a0
+; RV32ID-NEXT:    feq.s a0, fa3, fa3
+; RV32ID-NEXT:    fmax.s fa5, fa3, fa5
 ; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -83,13 +83,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; CHECK64ZFBFMIN-LABEL: fcvt_si_bf16_sat:
 ; CHECK64ZFBFMIN:       # %bb.0: # %start
 ; CHECK64ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK64ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK64ZFBFMIN-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK64ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; CHECK64ZFBFMIN-NEXT:    lui a1, 815104
-; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK64ZFBFMIN-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK64ZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK64ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; CHECK64ZFBFMIN-NEXT:    lui a0, 815104
+; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK64ZFBFMIN-NEXT:    neg a0, a0
+; CHECK64ZFBFMIN-NEXT:    neg a0, a1
 ; CHECK64ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fcvt.l.s a1, fa5, rtz
 ; CHECK64ZFBFMIN-NEXT:    and a0, a0, a1
@@ -98,14 +98,14 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; RV64ID-LABEL: fcvt_si_bf16_sat:
 ; RV64ID:       # %bb.0: # %start
 ; RV64ID-NEXT:    fmv.x.w a0, fa0
-; RV64ID-NEXT:    slli a0, a0, 16
-; RV64ID-NEXT:    fmv.w.x fa5, a0
-; RV64ID-NEXT:    feq.s a0, fa5, fa5
+; RV64ID-NEXT:    lui a1, 815104
+; RV64ID-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV64ID-NEXT:    slli a0, a0, 16
 ; RV64ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV64ID-NEXT:    lui a1, 815104
-; RV64ID-NEXT:    fmv.w.x fa3, a1
-; RV64ID-NEXT:    fmax.s fa5, fa5, fa3
+; RV64ID-NEXT:    fmv.w.x fa3, a0
+; RV64ID-NEXT:    feq.s a0, fa3, fa3
+; RV64ID-NEXT:    fmax.s fa5, fa3, fa5
 ; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -466,7 +466,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; RV32IZFBFMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFBFMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFBFMIN-NEXT:    call __fixsfdi
-; RV32IZFBFMIN-NEXT:    lui a4, 524288
+; RV32IZFBFMIN-NEXT:    lui a3, 524288
 ; RV32IZFBFMIN-NEXT:    lui a2, 524288
 ; RV32IZFBFMIN-NEXT:    beqz s0, .LBB10_2
 ; RV32IZFBFMIN-NEXT:  # %bb.1: # %start
@@ -474,19 +474,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; RV32IZFBFMIN-NEXT:  .LBB10_2: # %start
 ; RV32IZFBFMIN-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IZFBFMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IZFBFMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFBFMIN-NEXT:    beqz a3, .LBB10_4
+; RV32IZFBFMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFBFMIN-NEXT:    beqz a1, .LBB10_4
 ; RV32IZFBFMIN-NEXT:  # %bb.3:
-; RV32IZFBFMIN-NEXT:    addi a2, a4, -1
+; RV32IZFBFMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFBFMIN-NEXT:  .LBB10_4: # %start
-; RV32IZFBFMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFBFMIN-NEXT:    feq.s a3, fs0, fs0
 ; RV32IZFBFMIN-NEXT:    neg a4, a1
-; RV32IZFBFMIN-NEXT:    and a1, a4, a2
-; RV32IZFBFMIN-NEXT:    neg a2, a3
-; RV32IZFBFMIN-NEXT:    neg a3, s0
+; RV32IZFBFMIN-NEXT:    neg a1, s0
+; RV32IZFBFMIN-NEXT:    neg a3, a3
+; RV32IZFBFMIN-NEXT:    and a0, a1, a0
+; RV32IZFBFMIN-NEXT:    and a1, a3, a2
+; RV32IZFBFMIN-NEXT:    or a0, a4, a0
 ; RV32IZFBFMIN-NEXT:    and a0, a3, a0
-; RV32IZFBFMIN-NEXT:    or a0, a2, a0
-; RV32IZFBFMIN-NEXT:    and a0, a4, a0
 ; RV32IZFBFMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFBFMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFBFMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -505,7 +505,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; R32IDZFBFMIN-NEXT:    fle.s s0, fa5, fs0
 ; R32IDZFBFMIN-NEXT:    fmv.s fa0, fs0
 ; R32IDZFBFMIN-NEXT:    call __fixsfdi
-; R32IDZFBFMIN-NEXT:    lui a4, 524288
+; R32IDZFBFMIN-NEXT:    lui a3, 524288
 ; R32IDZFBFMIN-NEXT:    lui a2, 524288
 ; R32IDZFBFMIN-NEXT:    beqz s0, .LBB10_2
 ; R32IDZFBFMIN-NEXT:  # %bb.1: # %start
@@ -513,19 +513,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; R32IDZFBFMIN-NEXT:  .LBB10_2: # %start
 ; R32IDZFBFMIN-NEXT:    lui a1, %hi(.LCPI10_0)
 ; R32IDZFBFMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; R32IDZFBFMIN-NEXT:    flt.s a3, fa5, fs0
-; R32IDZFBFMIN-NEXT:    beqz a3, .LBB10_4
+; R32IDZFBFMIN-NEXT:    flt.s a1, fa5, fs0
+; R32IDZFBFMIN-NEXT:    beqz a1, .LBB10_4
 ; R32IDZFBFMIN-NEXT:  # %bb.3:
-; R32IDZFBFMIN-NEXT:    addi a2, a4, -1
+; R32IDZFBFMIN-NEXT:    addi a2, a3, -1
 ; R32IDZFBFMIN-NEXT:  .LBB10_4: # %start
-; R32IDZFBFMIN-NEXT:    feq.s a1, fs0, fs0
+; R32IDZFBFMIN-NEXT:    feq.s a3, fs0, fs0
 ; R32IDZFBFMIN-NEXT:    neg a4, a1
-; R32IDZFBFMIN-NEXT:    and a1, a4, a2
-; R32IDZFBFMIN-NEXT:    neg a2, a3
-; R32IDZFBFMIN-NEXT:    neg a3, s0
+; R32IDZFBFMIN-NEXT:    neg a1, s0
+; R32IDZFBFMIN-NEXT:    neg a3, a3
+; R32IDZFBFMIN-NEXT:    and a0, a1, a0
+; R32IDZFBFMIN-NEXT:    and a1, a3, a2
+; R32IDZFBFMIN-NEXT:    or a0, a4, a0
 ; R32IDZFBFMIN-NEXT:    and a0, a3, a0
-; R32IDZFBFMIN-NEXT:    or a0, a2, a0
-; R32IDZFBFMIN-NEXT:    and a0, a4, a0
 ; R32IDZFBFMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; R32IDZFBFMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; R32IDZFBFMIN-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -546,7 +546,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-NEXT:    fle.s s0, fa5, fs0
 ; RV32ID-NEXT:    fmv.s fa0, fs0
 ; RV32ID-NEXT:    call __fixsfdi
-; RV32ID-NEXT:    lui a4, 524288
+; RV32ID-NEXT:    lui a3, 524288
 ; RV32ID-NEXT:    lui a2, 524288
 ; RV32ID-NEXT:    beqz s0, .LBB10_2
 ; RV32ID-NEXT:  # %bb.1: # %start
@@ -554,19 +554,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-NEXT:  .LBB10_2: # %start
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32ID-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32ID-NEXT:    flt.s a3, fa5, fs0
-; RV32ID-NEXT:    beqz a3, .LBB10_4
+; RV32ID-NEXT:    flt.s a1, fa5, fs0
+; RV32ID-NEXT:    beqz a1, .LBB10_4
 ; RV32ID-NEXT:  # %bb.3:
-; RV32ID-NEXT:    addi a2, a4, -1
+; RV32ID-NEXT:    addi a2, a3, -1
 ; RV32ID-NEXT:  .LBB10_4: # %start
-; RV32ID-NEXT:    feq.s a1, fs0, fs0
+; RV32ID-NEXT:    feq.s a3, fs0, fs0
 ; RV32ID-NEXT:    neg a4, a1
-; RV32ID-NEXT:    and a1, a4, a2
-; RV32ID-NEXT:    neg a2, a3
-; RV32ID-NEXT:    neg a3, s0
+; RV32ID-NEXT:    neg a1, s0
+; RV32ID-NEXT:    neg a3, a3
+; RV32ID-NEXT:    and a0, a1, a0
+; RV32ID-NEXT:    and a1, a3, a2
+; RV32ID-NEXT:    or a0, a4, a0
 ; RV32ID-NEXT:    and a0, a3, a0
-; RV32ID-NEXT:    or a0, a2, a0
-; RV32ID-NEXT:    and a0, a4, a0
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -650,15 +650,15 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK32ZFBFMIN-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa0, fa0
-; CHECK32ZFBFMIN-NEXT:    flt.s a0, fa5, fa0
-; CHECK32ZFBFMIN-NEXT:    neg s0, a0
-; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa5, zero
-; CHECK32ZFBFMIN-NEXT:    fle.s a0, fa5, fa0
+; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK32ZFBFMIN-NEXT:    fle.s a0, fa4, fa0
+; CHECK32ZFBFMIN-NEXT:    flt.s a1, fa5, fa0
+; CHECK32ZFBFMIN-NEXT:    neg s0, a1
 ; CHECK32ZFBFMIN-NEXT:    neg s1, a0
 ; CHECK32ZFBFMIN-NEXT:    call __fixunssfdi
 ; CHECK32ZFBFMIN-NEXT:    and a0, s1, a0
-; CHECK32ZFBFMIN-NEXT:    or a0, s0, a0
 ; CHECK32ZFBFMIN-NEXT:    and a1, s1, a1
+; CHECK32ZFBFMIN-NEXT:    or a0, s0, a0
 ; CHECK32ZFBFMIN-NEXT:    or a1, s0, a1
 ; CHECK32ZFBFMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32ZFBFMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -672,20 +672,20 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32ID-NEXT:    lui a0, %hi(.LCPI12_0)
-; RV32ID-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
+; RV32ID-NEXT:    lui a1, %hi(.LCPI12_0)
+; RV32ID-NEXT:    fmv.w.x fa5, zero
+; RV32ID-NEXT:    flw fa4, %lo(.LCPI12_0)(a1)
 ; RV32ID-NEXT:    slli a0, a0, 16
 ; RV32ID-NEXT:    fmv.w.x fa0, a0
-; RV32ID-NEXT:    flt.s a0, fa5, fa0
-; RV32ID-NEXT:    neg s0, a0
-; RV32ID-NEXT:    fmv.w.x fa5, zero
 ; RV32ID-NEXT:    fle.s a0, fa5, fa0
+; RV32ID-NEXT:    flt.s a1, fa4, fa0
+; RV32ID-NEXT:    neg s0, a1
 ; RV32ID-NEXT:    neg s1, a0
 ; RV32ID-NEXT:    call __fixunssfdi
 ; RV32ID-NEXT:    and a0, s1, a0
-; RV32ID-NEXT:    or a0, s0, a0
 ; RV32ID-NEXT:    and a1, s1, a1
+; RV32ID-NEXT:    or a0, s0, a0
 ; RV32ID-NEXT:    or a1, s0, a1
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1498,12 +1498,12 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK32ZFBFMIN:       # %bb.0: # %start
 ; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
+; CHECK32ZFBFMIN-NEXT:    lui a0, 798720
+; CHECK32ZFBFMIN-NEXT:    lui a1, 274400
+; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK32ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
 ; CHECK32ZFBFMIN-NEXT:    neg a0, a0
-; CHECK32ZFBFMIN-NEXT:    lui a1, 798720
-; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
-; CHECK32ZFBFMIN-NEXT:    lui a1, 274400
 ; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK32ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -1513,15 +1513,15 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind {
 ; RV32ID-LABEL: fcvt_w_s_sat_i8:
 ; RV32ID:       # %bb.0: # %start
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
-; RV32ID-NEXT:    slli a0, a0, 16
-; RV32ID-NEXT:    fmv.w.x fa5, a0
-; RV32ID-NEXT:    feq.s a0, fa5, fa5
-; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    lui a1, 798720
-; RV32ID-NEXT:    fmv.w.x fa4, a1
-; RV32ID-NEXT:    fmax.s fa5, fa5, fa4
+; RV32ID-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-NEXT:    lui a1, 274400
+; RV32ID-NEXT:    slli a0, a0, 16
+; RV32ID-NEXT:    fmv.w.x fa4, a0
+; RV32ID-NEXT:    feq.s a0, fa4, fa4
+; RV32ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV32ID-NEXT:    fmv.w.x fa4, a1
+; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-NEXT:    and a0, a0, a1
@@ -1530,12 +1530,12 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind {
 ; CHECK64ZFBFMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK64ZFBFMIN:       # %bb.0: # %start
 ; CHECK64ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
+; CHECK64ZFBFMIN-NEXT:    lui a0, 798720
+; CHECK64ZFBFMIN-NEXT:    lui a1, 274400
+; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK64ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
 ; CHECK64ZFBFMIN-NEXT:    neg a0, a0
-; CHECK64ZFBFMIN-NEXT:    lui a1, 798720
-; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
-; CHECK64ZFBFMIN-NEXT:    lui a1, 274400
 ; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK64ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -1545,15 +1545,15 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind {
 ; RV64ID-LABEL: fcvt_w_s_sat_i8:
 ; RV64ID:       # %bb.0: # %start
 ; RV64ID-NEXT:    fmv.x.w a0, fa0
-; RV64ID-NEXT:    slli a0, a0, 16
-; RV64ID-NEXT:    fmv.w.x fa5, a0
-; RV64ID-NEXT:    feq.s a0, fa5, fa5
-; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    lui a1, 798720
-; RV64ID-NEXT:    fmv.w.x fa4, a1
-; RV64ID-NEXT:    fmax.s fa5, fa5, fa4
+; RV64ID-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-NEXT:    lui a1, 274400
+; RV64ID-NEXT:    slli a0, a0, 16
+; RV64ID-NEXT:    fmv.w.x fa4, a0
+; RV64ID-NEXT:    feq.s a0, fa4, fa4
+; RV64ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV64ID-NEXT:    fmv.w.x fa4, a1
+; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-NEXT:    and a0, a0, a1
@@ -1601,8 +1601,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN:       # %bb.0: # %start
 ; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    lui a0, 276464
+; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK32ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -1611,11 +1611,11 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind {
 ; RV32ID-LABEL: fcvt_wu_s_sat_i8:
 ; RV32ID:       # %bb.0: # %start
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
+; RV32ID-NEXT:    fmv.w.x fa5, zero
 ; RV32ID-NEXT:    slli a0, a0, 16
-; RV32ID-NEXT:    fmv.w.x fa5, a0
-; RV32ID-NEXT:    fmv.w.x fa4, zero
-; RV32ID-NEXT:    fmax.s fa5, fa5, fa4
+; RV32ID-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-NEXT:    lui a0, 276464
+; RV32ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV32ID-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -1625,8 +1625,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind {
 ; CHECK64ZFBFMIN:       # %bb.0: # %start
 ; CHECK64ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    lui a0, 276464
+; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK64ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -1635,11 +1635,11 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind {
 ; RV64ID-LABEL: fcvt_wu_s_sat_i8:
 ; RV64ID:       # %bb.0: # %start
 ; RV64ID-NEXT:    fmv.x.w a0, fa0
+; RV64ID-NEXT:    fmv.w.x fa5, zero
 ; RV64ID-NEXT:    slli a0, a0, 16
-; RV64ID-NEXT:    fmv.w.x fa5, a0
-; RV64ID-NEXT:    fmv.w.x fa4, zero
-; RV64ID-NEXT:    fmax.s fa5, fa5, fa4
+; RV64ID-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-NEXT:    lui a0, 276464
+; RV64ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV64ID-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.lu.s a0, fa5, rtz
diff --git a/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll b/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll
index 9d5ba73de191d5..cc572ce489f62c 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll
@@ -92,9 +92,9 @@ define i32 @fcmp_ord(bfloat %a, bfloat %b) nounwind {
 ; CHECK-LABEL: fcmp_ord:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
 ; CHECK-NEXT:    feq.s a0, fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT:    feq.s a1, fa5, fa5
+; CHECK-NEXT:    feq.s a1, fa4, fa4
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    ret
   %1 = fcmp ord bfloat %a, %b
@@ -186,9 +186,9 @@ define i32 @fcmp_uno(bfloat %a, bfloat %b) nounwind {
 ; CHECK-LABEL: fcmp_uno:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
 ; CHECK-NEXT:    feq.s a0, fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT:    feq.s a1, fa5, fa5
+; CHECK-NEXT:    feq.s a1, fa4, fa4
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    xori a0, a0, 1
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/bfloat-mem.ll b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
index a9ef261bb93024..f9cf4e523b77d4 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-mem.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
@@ -50,10 +50,10 @@ define bfloat @flh_fsh_global(bfloat %a, bfloat %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
+; CHECK-NEXT:    lui a0, %hi(G)
 ; CHECK-NEXT:    fadd.s fa5, fa4, fa5
+; CHECK-NEXT:    flh fa4, %lo(G)(a0)
 ; CHECK-NEXT:    fcvt.bf16.s fa0, fa5
-; CHECK-NEXT:    lui a0, %hi(G)
-; CHECK-NEXT:    flh fa5, %lo(G)(a0)
 ; CHECK-NEXT:    addi a1, a0, %lo(G)
 ; CHECK-NEXT:    fsh fa0, %lo(G)(a0)
 ; CHECK-NEXT:    flh fa5, 18(a1)
diff --git a/llvm/test/CodeGen/RISCV/bfloat.ll b/llvm/test/CodeGen/RISCV/bfloat.ll
index 9dc8ce6be1ea61..c83b0ed6b0eee3 100644
--- a/llvm/test/CodeGen/RISCV/bfloat.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat.ll
@@ -342,8 +342,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
 ; RV32ID-ILP32-NEXT:    addi sp, sp, -16
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32ID-ILP32-NEXT:    fmv.x.w a0, fa5
@@ -359,8 +359,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
 ; RV64ID-LP64-NEXT:    addi sp, sp, -16
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    slli a1, a1, 16
-; RV64ID-LP64-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64-NEXT:    slli a0, a0, 16
+; RV64ID-LP64-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64ID-LP64-NEXT:    fmv.x.w a0, fa5
@@ -378,8 +378,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
 ; RV32ID-ILP32D-NEXT:    fmv.x.w a0, fa0
 ; RV32ID-ILP32D-NEXT:    fmv.x.w a1, fa1
 ; RV32ID-ILP32D-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32ID-ILP32D-NEXT:    call __truncsfbf2
@@ -398,8 +398,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
 ; RV64ID-LP64D-NEXT:    fmv.x.w a0, fa0
 ; RV64ID-LP64D-NEXT:    fmv.x.w a1, fa1
 ; RV64ID-LP64D-NEXT:    slli a1, a1, 16
-; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    slli a0, a0, 16
+; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64ID-LP64D-NEXT:    call __truncsfbf2
@@ -450,8 +450,8 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV32ID-ILP32-NEXT:    lhu a1, 6(a0)
 ; RV32ID-ILP32-NEXT:    lhu a0, 0(a0)
 ; RV32ID-ILP32-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32ID-ILP32-NEXT:    fmv.x.w a0, fa5
@@ -469,8 +469,8 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV64ID-LP64-NEXT:    lhu a1, 6(a0)
 ; RV64ID-LP64-NEXT:    lhu a0, 0(a0)
 ; RV64ID-LP64-NEXT:    slli a1, a1, 16
-; RV64ID-LP64-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64-NEXT:    slli a0, a0, 16
+; RV64ID-LP64-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64ID-LP64-NEXT:    fmv.x.w a0, fa5
@@ -488,8 +488,8 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV32ID-ILP32D-NEXT:    lhu a1, 6(a0)
 ; RV32ID-ILP32D-NEXT:    lhu a0, 0(a0)
 ; RV32ID-ILP32D-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32ID-ILP32D-NEXT:    call __truncsfbf2
@@ -508,8 +508,8 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV64ID-LP64D-NEXT:    lhu a1, 6(a0)
 ; RV64ID-LP64D-NEXT:    lhu a0, 0(a0)
 ; RV64ID-LP64D-NEXT:    slli a1, a1, 16
-; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    slli a0, a0, 16
+; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64ID-LP64D-NEXT:    call __truncsfbf2
@@ -569,8 +569,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
 ; RV32ID-ILP32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    mv s0, a0
 ; RV32ID-ILP32-NEXT:    slli a2, a2, 16
-; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a2
 ; RV32ID-ILP32-NEXT:    slli a1, a1, 16
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-ILP32-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32ID-ILP32-NEXT:    fmv.x.w a0, fa5
@@ -589,8 +589,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
 ; RV64ID-LP64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    mv s0, a0
 ; RV64ID-LP64-NEXT:    slli a2, a2, 16
-; RV64ID-LP64-NEXT:    fmv.w.x fa5, a2
 ; RV64ID-LP64-NEXT:    slli a1, a1, 16
+; RV64ID-LP64-NEXT:    fmv.w.x fa5, a2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-LP64-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64ID-LP64-NEXT:    fmv.x.w a0, fa5
@@ -611,8 +611,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
 ; RV32ID-ILP32D-NEXT:    fmv.x.w a0, fa0
 ; RV32ID-ILP32D-NEXT:    fmv.x.w a1, fa1
 ; RV32ID-ILP32D-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32ID-ILP32D-NEXT:    call __truncsfbf2
@@ -633,8 +633,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
 ; RV64ID-LP64D-NEXT:    fmv.x.w a0, fa0
 ; RV64ID-LP64D-NEXT:    fmv.x.w a1, fa1
 ; RV64ID-LP64D-NEXT:    slli a1, a1, 16
-; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    slli a0, a0, 16
+; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64ID-LP64D-NEXT:    call __truncsfbf2
diff --git a/llvm/test/CodeGen/RISCV/bitextract-mac.ll b/llvm/test/CodeGen/RISCV/bitextract-mac.ll
index ce1e0c4711ffbe..41a32656e32578 100644
--- a/llvm/test/CodeGen/RISCV/bitextract-mac.ll
+++ b/llvm/test/CodeGen/RISCV/bitextract-mac.ll
@@ -25,8 +25,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    mul a0, a1, a0
 ; RV32I-NEXT:    slli a1, a0, 26
-; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    srli a0, a0, 25
 ; RV32I-NEXT:    mul a0, a1, a0
 ; RV32I-NEXT:    add a0, a0, a2
@@ -36,8 +36,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV32ZBB:       # %bb.0: # %entry
 ; RV32ZBB-NEXT:    mul a0, a1, a0
 ; RV32ZBB-NEXT:    slli a1, a0, 26
-; RV32ZBB-NEXT:    srli a1, a1, 28
 ; RV32ZBB-NEXT:    slli a0, a0, 20
+; RV32ZBB-NEXT:    srli a1, a1, 28
 ; RV32ZBB-NEXT:    srli a0, a0, 25
 ; RV32ZBB-NEXT:    mul a0, a1, a0
 ; RV32ZBB-NEXT:    add a0, a0, a2
@@ -56,8 +56,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV32XTHEADMAC:       # %bb.0: # %entry
 ; RV32XTHEADMAC-NEXT:    mul a0, a1, a0
 ; RV32XTHEADMAC-NEXT:    slli a1, a0, 26
-; RV32XTHEADMAC-NEXT:    srli a1, a1, 28
 ; RV32XTHEADMAC-NEXT:    slli a0, a0, 20
+; RV32XTHEADMAC-NEXT:    srli a1, a1, 28
 ; RV32XTHEADMAC-NEXT:    srli a0, a0, 25
 ; RV32XTHEADMAC-NEXT:    th.mulah a2, a1, a0
 ; RV32XTHEADMAC-NEXT:    mv a0, a2
@@ -76,8 +76,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    mul a0, a1, a0
 ; RV64I-NEXT:    slli a1, a0, 58
-; RV64I-NEXT:    srli a1, a1, 60
 ; RV64I-NEXT:    slli a0, a0, 52
+; RV64I-NEXT:    srli a1, a1, 60
 ; RV64I-NEXT:    srli a0, a0, 57
 ; RV64I-NEXT:    mul a0, a1, a0
 ; RV64I-NEXT:    addw a0, a0, a2
@@ -87,8 +87,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV64ZBB:       # %bb.0: # %entry
 ; RV64ZBB-NEXT:    mul a0, a1, a0
 ; RV64ZBB-NEXT:    slli a1, a0, 58
-; RV64ZBB-NEXT:    srli a1, a1, 60
 ; RV64ZBB-NEXT:    slli a0, a0, 52
+; RV64ZBB-NEXT:    srli a1, a1, 60
 ; RV64ZBB-NEXT:    srli a0, a0, 57
 ; RV64ZBB-NEXT:    mul a0, a1, a0
 ; RV64ZBB-NEXT:    addw a0, a0, a2
@@ -98,8 +98,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV64XTHEADMAC:       # %bb.0: # %entry
 ; RV64XTHEADMAC-NEXT:    mul a0, a1, a0
 ; RV64XTHEADMAC-NEXT:    slli a1, a0, 58
-; RV64XTHEADMAC-NEXT:    srli a1, a1, 60
 ; RV64XTHEADMAC-NEXT:    slli a0, a0, 52
+; RV64XTHEADMAC-NEXT:    srli a1, a1, 60
 ; RV64XTHEADMAC-NEXT:    srli a0, a0, 57
 ; RV64XTHEADMAC-NEXT:    th.mulah a2, a1, a0
 ; RV64XTHEADMAC-NEXT:    mv a0, a2
diff --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
index 9c69fe0a6e4865..40a57721423453 100644
--- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
@@ -57,11 +57,11 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
@@ -72,11 +72,11 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srliw a3, a0, 24
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
@@ -102,53 +102,52 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 8
 ; RV32I-NEXT:    lui a3, 16
+; RV32I-NEXT:    srli a4, a1, 24
+; RV32I-NEXT:    srli a5, a0, 8
 ; RV32I-NEXT:    addi a3, a3, -256
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    or a2, a1, a2
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a5, a1, 24
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a0, a3
+; RV32I-NEXT:    or a0, a1, a2
+; RV32I-NEXT:    or a1, a3, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_bswap_i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 40
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srli a3, a0, 56
+; RV64I-NEXT:    srli a4, a0, 24
+; RV64I-NEXT:    lui a5, 4080
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    lui a4, 4080
-; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a5, a0, 8
-; RV64I-NEXT:    srliw a5, a5, 24
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    and a4, a0, a4
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    and a4, a4, a5
+; RV64I-NEXT:    srliw a3, a3, 24
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    and a5, a0, a5
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a2, a2, 40
+; RV64I-NEXT:    or a1, a3, a1
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
@@ -171,18 +170,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; RV32I-LABEL: test_bitreverse_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a1, a0, 15
-; RV32I-NEXT:    slli a1, a1, 4
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    slli a1, a1, 4
 ; RV32I-NEXT:    srli a0, a0, 28
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    andi a1, a0, 51
-; RV32I-NEXT:    slli a1, a1, 2
 ; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    slli a1, a1, 2
 ; RV32I-NEXT:    andi a0, a0, 51
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    andi a1, a0, 85
-; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    andi a0, a0, 85
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -190,18 +189,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; RV64I-LABEL: test_bitreverse_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    andi a1, a0, 15
-; RV64I-NEXT:    slli a1, a1, 4
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a1, a1, 4
 ; RV64I-NEXT:    srli a0, a0, 60
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    andi a1, a0, 51
-; RV64I-NEXT:    slli a1, a1, 2
 ; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    slli a1, a1, 2
 ; RV64I-NEXT:    andi a0, a0, 51
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    andi a1, a0, 85
-; RV64I-NEXT:    slli a1, a1, 1
 ; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    slli a1, a1, 1
 ; RV64I-NEXT:    andi a0, a0, 85
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -209,18 +208,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; RV32ZBB-LABEL: test_bitreverse_i8:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    andi a1, a0, 15
-; RV32ZBB-NEXT:    slli a1, a1, 4
 ; RV32ZBB-NEXT:    slli a0, a0, 24
+; RV32ZBB-NEXT:    slli a1, a1, 4
 ; RV32ZBB-NEXT:    srli a0, a0, 28
 ; RV32ZBB-NEXT:    or a0, a0, a1
 ; RV32ZBB-NEXT:    andi a1, a0, 51
-; RV32ZBB-NEXT:    slli a1, a1, 2
 ; RV32ZBB-NEXT:    srli a0, a0, 2
+; RV32ZBB-NEXT:    slli a1, a1, 2
 ; RV32ZBB-NEXT:    andi a0, a0, 51
 ; RV32ZBB-NEXT:    or a0, a0, a1
 ; RV32ZBB-NEXT:    andi a1, a0, 85
-; RV32ZBB-NEXT:    slli a1, a1, 1
 ; RV32ZBB-NEXT:    srli a0, a0, 1
+; RV32ZBB-NEXT:    slli a1, a1, 1
 ; RV32ZBB-NEXT:    andi a0, a0, 85
 ; RV32ZBB-NEXT:    or a0, a0, a1
 ; RV32ZBB-NEXT:    ret
@@ -228,18 +227,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; RV64ZBB-LABEL: test_bitreverse_i8:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    andi a1, a0, 15
-; RV64ZBB-NEXT:    slli a1, a1, 4
 ; RV64ZBB-NEXT:    slli a0, a0, 56
+; RV64ZBB-NEXT:    slli a1, a1, 4
 ; RV64ZBB-NEXT:    srli a0, a0, 60
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    andi a1, a0, 51
-; RV64ZBB-NEXT:    slli a1, a1, 2
 ; RV64ZBB-NEXT:    srli a0, a0, 2
+; RV64ZBB-NEXT:    slli a1, a1, 2
 ; RV64ZBB-NEXT:    andi a0, a0, 51
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    andi a1, a0, 85
-; RV64ZBB-NEXT:    slli a1, a1, 1
 ; RV64ZBB-NEXT:    srli a0, a0, 1
+; RV64ZBB-NEXT:    slli a1, a1, 1
 ; RV64ZBB-NEXT:    andi a0, a0, 85
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    ret
@@ -266,27 +265,27 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    lui a2, 1
 ; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    lui a2, 1
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 3
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 5
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -295,27 +294,27 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    lui a2, 1
 ; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 1
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 3
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 3
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 5
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -323,25 +322,25 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; RV32ZBB-LABEL: test_bitreverse_i16:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    rev8 a0, a0
-; RV32ZBB-NEXT:    srli a1, a0, 12
-; RV32ZBB-NEXT:    lui a2, 15
-; RV32ZBB-NEXT:    addi a2, a2, 240
-; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a1, 15
+; RV32ZBB-NEXT:    srli a2, a0, 12
+; RV32ZBB-NEXT:    addi a1, a1, 240
+; RV32ZBB-NEXT:    and a1, a2, a1
+; RV32ZBB-NEXT:    lui a2, 3
 ; RV32ZBB-NEXT:    srli a0, a0, 20
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    andi a0, a0, -241
 ; RV32ZBB-NEXT:    or a0, a0, a1
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 3
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 5
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 5
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -349,25 +348,25 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; RV64ZBB-LABEL: test_bitreverse_i16:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    rev8 a0, a0
-; RV64ZBB-NEXT:    srli a1, a0, 44
-; RV64ZBB-NEXT:    lui a2, 15
-; RV64ZBB-NEXT:    addiw a2, a2, 240
-; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a1, 15
+; RV64ZBB-NEXT:    srli a2, a0, 44
+; RV64ZBB-NEXT:    addiw a1, a1, 240
+; RV64ZBB-NEXT:    and a1, a2, a1
+; RV64ZBB-NEXT:    lui a2, 3
 ; RV64ZBB-NEXT:    srli a0, a0, 52
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    andi a0, a0, -241
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 3
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 5
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 5
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -394,34 +393,34 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    lui a3, 61681
+; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a3, a3, -241
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -430,34 +429,34 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srliw a3, a0, 24
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, 1365
 ; RV64I-NEXT:    slliw a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slliw a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slliw a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -465,25 +464,25 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV32ZBB-LABEL: test_bitreverse_i32:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    rev8 a0, a0
-; RV32ZBB-NEXT:    srli a1, a0, 4
-; RV32ZBB-NEXT:    lui a2, 61681
-; RV32ZBB-NEXT:    addi a2, a2, -241
-; RV32ZBB-NEXT:    and a1, a1, a2
-; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a1, 61681
+; RV32ZBB-NEXT:    srli a2, a0, 4
+; RV32ZBB-NEXT:    addi a1, a1, -241
+; RV32ZBB-NEXT:    and a2, a2, a1
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    lui a1, 209715
+; RV32ZBB-NEXT:    addi a1, a1, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
-; RV32ZBB-NEXT:    or a0, a1, a0
-; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 209715
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
-; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    srli a2, a0, 2
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    and a1, a2, a1
+; RV32ZBB-NEXT:    lui a2, 349525
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 349525
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -491,28 +490,28 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV64ZBB-LABEL: test_bitreverse_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    rev8 a0, a0
-; RV64ZBB-NEXT:    srli a1, a0, 36
-; RV64ZBB-NEXT:    lui a2, 61681
-; RV64ZBB-NEXT:    addiw a2, a2, -241
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    srli a0, a0, 28
+; RV64ZBB-NEXT:    lui a1, 61681
+; RV64ZBB-NEXT:    srli a2, a0, 36
+; RV64ZBB-NEXT:    addiw a1, a1, -241
+; RV64ZBB-NEXT:    and a1, a2, a1
 ; RV64ZBB-NEXT:    lui a2, 986895
+; RV64ZBB-NEXT:    srli a0, a0, 28
 ; RV64ZBB-NEXT:    addi a2, a2, 240
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    sext.w a0, a0
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 209715
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 349525
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slliw a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slliw a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -538,115 +537,114 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 8
 ; RV32I-NEXT:    lui a3, 16
+; RV32I-NEXT:    srli a4, a1, 24
+; RV32I-NEXT:    slli a5, a1, 24
+; RV32I-NEXT:    lui a6, 61681
+; RV32I-NEXT:    srli a7, a0, 8
 ; RV32I-NEXT:    addi a3, a3, -256
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    srli a4, a0, 24
+; RV32I-NEXT:    and a7, a7, a3
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    lui a7, 209715
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    lui a5, 349525
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    addi a6, a6, -241
+; RV32I-NEXT:    addi a7, a7, 819
+; RV32I-NEXT:    addi a5, a5, 1365
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    srli a2, a1, 4
-; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi a4, a4, -241
-; RV32I-NEXT:    and a2, a2, a4
-; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a1, a1, a6
+; RV32I-NEXT:    srli a3, a0, 4
+; RV32I-NEXT:    and a0, a0, a6
+; RV32I-NEXT:    and a2, a2, a6
 ; RV32I-NEXT:    slli a1, a1, 4
+; RV32I-NEXT:    and a3, a3, a6
+; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    srli a2, a1, 2
-; RV32I-NEXT:    lui a5, 209715
-; RV32I-NEXT:    addi a5, a5, 819
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a1, a1, a7
+; RV32I-NEXT:    srli a3, a0, 2
+; RV32I-NEXT:    and a0, a0, a7
+; RV32I-NEXT:    and a2, a2, a7
 ; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    and a3, a3, a7
+; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    srli a2, a1, 1
-; RV32I-NEXT:    lui a6, 349525
-; RV32I-NEXT:    addi a6, a6, 1365
-; RV32I-NEXT:    and a2, a2, a6
-; RV32I-NEXT:    and a1, a1, a6
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    or a2, a2, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    srli a7, a0, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    slli a0, a0, 4
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    srli a3, a0, 1
 ; RV32I-NEXT:    and a0, a0, a5
-; RV32I-NEXT:    slli a0, a0, 2
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    and a1, a1, a6
-; RV32I-NEXT:    and a0, a0, a6
-; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    or a1, a1, a0
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    and a2, a2, a5
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a3, a3, a5
+; RV32I-NEXT:    slli a4, a0, 1
+; RV32I-NEXT:    or a0, a2, a1
+; RV32I-NEXT:    or a1, a3, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_bitreverse_i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 40
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srli a3, a0, 56
+; RV64I-NEXT:    srli a4, a0, 24
+; RV64I-NEXT:    lui a5, 4080
+; RV64I-NEXT:    srli a6, a0, 8
+; RV64I-NEXT:    srliw a7, a0, 24
+; RV64I-NEXT:    lui t0, 61681
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    lui a4, 4080
-; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a5, a0, 8
-; RV64I-NEXT:    srliw a5, a5, 24
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    and a4, a4, a5
+; RV64I-NEXT:    srliw a6, a6, 24
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    lui a6, 349525
+; RV64I-NEXT:    and a5, a0, a5
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    addiw t0, t0, -241
+; RV64I-NEXT:    addiw a3, a3, 819
+; RV64I-NEXT:    addiw a6, a6, 1365
 ; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    and a4, a0, a4
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a5, a5, a7
+; RV64I-NEXT:    slli a7, t0, 32
+; RV64I-NEXT:    add a7, t0, a7
+; RV64I-NEXT:    slli t0, a3, 32
+; RV64I-NEXT:    add a3, a3, t0
+; RV64I-NEXT:    slli t0, a6, 32
+; RV64I-NEXT:    add a6, a6, t0
+; RV64I-NEXT:    or a1, a4, a1
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a7
+; RV64I-NEXT:    and a1, a1, a7
 ; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a6
+; RV64I-NEXT:    and a1, a1, a6
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -654,74 +652,73 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32ZBB-LABEL: test_bitreverse_i64:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    rev8 a1, a1
-; RV32ZBB-NEXT:    srli a2, a1, 4
-; RV32ZBB-NEXT:    lui a3, 61681
-; RV32ZBB-NEXT:    addi a3, a3, -241
-; RV32ZBB-NEXT:    and a2, a2, a3
-; RV32ZBB-NEXT:    and a1, a1, a3
-; RV32ZBB-NEXT:    slli a1, a1, 4
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 2
-; RV32ZBB-NEXT:    lui a4, 209715
-; RV32ZBB-NEXT:    addi a4, a4, 819
-; RV32ZBB-NEXT:    and a2, a2, a4
-; RV32ZBB-NEXT:    and a1, a1, a4
-; RV32ZBB-NEXT:    slli a1, a1, 2
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 1
-; RV32ZBB-NEXT:    lui a5, 349525
-; RV32ZBB-NEXT:    addi a5, a5, 1365
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a1, a1, a5
-; RV32ZBB-NEXT:    slli a1, a1, 1
-; RV32ZBB-NEXT:    or a2, a2, a1
+; RV32ZBB-NEXT:    lui a2, 61681
+; RV32ZBB-NEXT:    lui a3, 209715
 ; RV32ZBB-NEXT:    rev8 a0, a0
-; RV32ZBB-NEXT:    srli a1, a0, 4
+; RV32ZBB-NEXT:    srli a4, a1, 4
+; RV32ZBB-NEXT:    addi a2, a2, -241
+; RV32ZBB-NEXT:    srli a5, a0, 4
+; RV32ZBB-NEXT:    and a4, a4, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    and a5, a5, a2
+; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 349525
+; RV32ZBB-NEXT:    addi a3, a3, 819
+; RV32ZBB-NEXT:    addi a2, a2, 1365
+; RV32ZBB-NEXT:    slli a1, a1, 4
+; RV32ZBB-NEXT:    slli a0, a0, 4
+; RV32ZBB-NEXT:    or a1, a4, a1
+; RV32ZBB-NEXT:    or a0, a5, a0
+; RV32ZBB-NEXT:    srli a4, a1, 2
 ; RV32ZBB-NEXT:    and a1, a1, a3
+; RV32ZBB-NEXT:    srli a5, a0, 2
 ; RV32ZBB-NEXT:    and a0, a0, a3
-; RV32ZBB-NEXT:    slli a0, a0, 4
-; RV32ZBB-NEXT:    or a0, a1, a0
-; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    and a1, a1, a4
-; RV32ZBB-NEXT:    and a0, a0, a4
+; RV32ZBB-NEXT:    and a4, a4, a3
+; RV32ZBB-NEXT:    slli a1, a1, 2
+; RV32ZBB-NEXT:    and a3, a5, a3
 ; RV32ZBB-NEXT:    slli a0, a0, 2
-; RV32ZBB-NEXT:    or a0, a1, a0
-; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    and a1, a1, a5
-; RV32ZBB-NEXT:    and a0, a0, a5
-; RV32ZBB-NEXT:    slli a0, a0, 1
-; RV32ZBB-NEXT:    or a1, a1, a0
-; RV32ZBB-NEXT:    mv a0, a2
+; RV32ZBB-NEXT:    or a1, a4, a1
+; RV32ZBB-NEXT:    or a0, a3, a0
+; RV32ZBB-NEXT:    srli a3, a1, 1
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    srli a4, a0, 1
+; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a3, a3, a2
+; RV32ZBB-NEXT:    slli a1, a1, 1
+; RV32ZBB-NEXT:    and a2, a4, a2
+; RV32ZBB-NEXT:    slli a4, a0, 1
+; RV32ZBB-NEXT:    or a0, a3, a1
+; RV32ZBB-NEXT:    or a1, a2, a4
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: test_bitreverse_i64:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    rev8 a0, a0
-; RV64ZBB-NEXT:    srli a1, a0, 4
-; RV64ZBB-NEXT:    lui a2, 61681
-; RV64ZBB-NEXT:    addiw a2, a2, -241
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
-; RV64ZBB-NEXT:    slli a0, a0, 4
-; RV64ZBB-NEXT:    or a0, a1, a0
-; RV64ZBB-NEXT:    srli a1, a0, 2
+; RV64ZBB-NEXT:    lui a1, 61681
 ; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    lui a3, 349525
+; RV64ZBB-NEXT:    addiw a1, a1, -241
 ; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    addiw a3, a3, 1365
+; RV64ZBB-NEXT:    slli a4, a1, 32
+; RV64ZBB-NEXT:    add a1, a1, a4
+; RV64ZBB-NEXT:    slli a4, a2, 32
+; RV64ZBB-NEXT:    add a2, a2, a4
+; RV64ZBB-NEXT:    slli a4, a3, 32
+; RV64ZBB-NEXT:    add a3, a3, a4
+; RV64ZBB-NEXT:    srli a4, a0, 4
+; RV64ZBB-NEXT:    and a4, a4, a1
+; RV64ZBB-NEXT:    and a0, a0, a1
+; RV64ZBB-NEXT:    slli a0, a0, 4
+; RV64ZBB-NEXT:    or a0, a4, a0
+; RV64ZBB-NEXT:    srli a1, a0, 2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a0, a0, a3
+; RV64ZBB-NEXT:    and a1, a1, a3
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -729,10 +726,9 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32ZBKB-LABEL: test_bitreverse_i64:
 ; RV32ZBKB:       # %bb.0:
 ; RV32ZBKB-NEXT:    rev8 a1, a1
-; RV32ZBKB-NEXT:    brev8 a2, a1
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a1, a0
-; RV32ZBKB-NEXT:    mv a0, a2
+; RV32ZBKB-NEXT:    rev8 a2, a0
+; RV32ZBKB-NEXT:    brev8 a0, a1
+; RV32ZBKB-NEXT:    brev8 a1, a2
 ; RV32ZBKB-NEXT:    ret
 ;
 ; RV64ZBKB-LABEL: test_bitreverse_i64:
@@ -752,20 +748,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 ; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 3
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 5
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -777,20 +773,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 3
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 3
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 5
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -802,20 +798,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 ; RV32ZBB-NEXT:    addi a2, a2, -241
 ; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 3
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 3
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 5
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 5
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -827,20 +823,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 ; RV64ZBB-NEXT:    addiw a2, a2, -241
 ; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 3
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    slli a0, a0, 4
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 3
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 5
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 5
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -867,20 +863,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -892,20 +888,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slliw a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 349525
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slliw a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slliw a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -917,20 +913,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 ; RV32ZBB-NEXT:    addi a2, a2, -241
 ; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 209715
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 209715
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 349525
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 349525
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -942,20 +938,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 ; RV64ZBB-NEXT:    addiw a2, a2, -241
 ; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    slliw a0, a0, 4
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 209715
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 349525
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slliw a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slliw a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -979,69 +975,69 @@ define i64 @test_bswap_bitreverse_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    lui a3, 61681
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 4
 ; RV32I-NEXT:    addi a3, a3, -241
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    slli a1, a1, 4
 ; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    srli a5, a1, 2
+; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    and a4, a5, a4
+; RV32I-NEXT:    slli a1, a1, 2
 ; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    srli a2, a0, 1
-; RV32I-NEXT:    lui a5, 349525
-; RV32I-NEXT:    addi a5, a5, 1365
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a0, a0, a5
-; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    or a0, a2, a0
-; RV32I-NEXT:    srli a2, a1, 4
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    srli a4, a1, 1
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a1, 2
-; RV32I-NEXT:    and a2, a2, a4
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a1, 1
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a3, a4, a3
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_bswap_bitreverse_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    slli a0, a0, 4
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    addiw a3, a3, 1365
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    srli a4, a0, 4
+; RV64I-NEXT:    and a4, a4, a1
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1050,69 +1046,69 @@ define i64 @test_bswap_bitreverse_i64(i64 %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srli a2, a0, 4
 ; RV32ZBB-NEXT:    lui a3, 61681
+; RV32ZBB-NEXT:    lui a4, 209715
+; RV32ZBB-NEXT:    srli a5, a1, 4
 ; RV32ZBB-NEXT:    addi a3, a3, -241
 ; RV32ZBB-NEXT:    and a2, a2, a3
 ; RV32ZBB-NEXT:    and a0, a0, a3
+; RV32ZBB-NEXT:    and a5, a5, a3
+; RV32ZBB-NEXT:    and a1, a1, a3
+; RV32ZBB-NEXT:    lui a3, 349525
+; RV32ZBB-NEXT:    addi a4, a4, 819
+; RV32ZBB-NEXT:    addi a3, a3, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 4
+; RV32ZBB-NEXT:    slli a1, a1, 4
 ; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a5, a1
 ; RV32ZBB-NEXT:    srli a2, a0, 2
-; RV32ZBB-NEXT:    lui a4, 209715
-; RV32ZBB-NEXT:    addi a4, a4, 819
-; RV32ZBB-NEXT:    and a2, a2, a4
 ; RV32ZBB-NEXT:    and a0, a0, a4
+; RV32ZBB-NEXT:    srli a5, a1, 2
+; RV32ZBB-NEXT:    and a1, a1, a4
+; RV32ZBB-NEXT:    and a2, a2, a4
 ; RV32ZBB-NEXT:    slli a0, a0, 2
+; RV32ZBB-NEXT:    and a4, a5, a4
+; RV32ZBB-NEXT:    slli a1, a1, 2
 ; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a4, a1
 ; RV32ZBB-NEXT:    srli a2, a0, 1
-; RV32ZBB-NEXT:    lui a5, 349525
-; RV32ZBB-NEXT:    addi a5, a5, 1365
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a0, a0, a5
-; RV32ZBB-NEXT:    slli a0, a0, 1
-; RV32ZBB-NEXT:    or a0, a2, a0
-; RV32ZBB-NEXT:    srli a2, a1, 4
-; RV32ZBB-NEXT:    and a2, a2, a3
+; RV32ZBB-NEXT:    and a0, a0, a3
+; RV32ZBB-NEXT:    srli a4, a1, 1
 ; RV32ZBB-NEXT:    and a1, a1, a3
-; RV32ZBB-NEXT:    slli a1, a1, 4
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 2
-; RV32ZBB-NEXT:    and a2, a2, a4
-; RV32ZBB-NEXT:    and a1, a1, a4
-; RV32ZBB-NEXT:    slli a1, a1, 2
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 1
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a1, a1, a5
+; RV32ZBB-NEXT:    and a2, a2, a3
+; RV32ZBB-NEXT:    slli a0, a0, 1
+; RV32ZBB-NEXT:    and a3, a4, a3
 ; RV32ZBB-NEXT:    slli a1, a1, 1
-; RV32ZBB-NEXT:    or a1, a2, a1
+; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a3, a1
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: test_bswap_bitreverse_i64:
 ; RV64ZBB:       # %bb.0:
-; RV64ZBB-NEXT:    srli a1, a0, 4
-; RV64ZBB-NEXT:    lui a2, 61681
-; RV64ZBB-NEXT:    addiw a2, a2, -241
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
-; RV64ZBB-NEXT:    slli a0, a0, 4
-; RV64ZBB-NEXT:    or a0, a1, a0
-; RV64ZBB-NEXT:    srli a1, a0, 2
+; RV64ZBB-NEXT:    lui a1, 61681
 ; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    lui a3, 349525
+; RV64ZBB-NEXT:    addiw a1, a1, -241
 ; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    addiw a3, a3, 1365
+; RV64ZBB-NEXT:    slli a4, a1, 32
+; RV64ZBB-NEXT:    add a1, a1, a4
+; RV64ZBB-NEXT:    slli a4, a2, 32
+; RV64ZBB-NEXT:    add a2, a2, a4
+; RV64ZBB-NEXT:    slli a4, a3, 32
+; RV64ZBB-NEXT:    add a3, a3, a4
+; RV64ZBB-NEXT:    srli a4, a0, 4
+; RV64ZBB-NEXT:    and a4, a4, a1
+; RV64ZBB-NEXT:    and a0, a0, a1
+; RV64ZBB-NEXT:    slli a0, a0, 4
+; RV64ZBB-NEXT:    or a0, a4, a0
+; RV64ZBB-NEXT:    srli a1, a0, 2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a0, a0, a3
+; RV64ZBB-NEXT:    and a1, a1, a3
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -1140,20 +1136,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 ; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 3
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 5
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -1165,20 +1161,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 3
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 3
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 5
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1190,20 +1186,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 ; RV32ZBB-NEXT:    addi a2, a2, -241
 ; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 3
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 3
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 5
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 5
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -1215,20 +1211,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 ; RV64ZBB-NEXT:    addiw a2, a2, -241
 ; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 3
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    slli a0, a0, 4
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 3
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 5
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 5
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -1255,20 +1251,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -1280,20 +1276,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slliw a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 349525
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slliw a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slliw a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1305,20 +1301,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 ; RV32ZBB-NEXT:    addi a2, a2, -241
 ; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 209715
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 209715
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 349525
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 349525
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -1330,20 +1326,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 ; RV64ZBB-NEXT:    addiw a2, a2, -241
 ; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    slliw a0, a0, 4
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 209715
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 349525
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slliw a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slliw a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -1367,69 +1363,69 @@ define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    lui a3, 61681
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 4
 ; RV32I-NEXT:    addi a3, a3, -241
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    slli a1, a1, 4
 ; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    srli a5, a1, 2
+; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    and a4, a5, a4
+; RV32I-NEXT:    slli a1, a1, 2
 ; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    srli a2, a0, 1
-; RV32I-NEXT:    lui a5, 349525
-; RV32I-NEXT:    addi a5, a5, 1365
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a0, a0, a5
-; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    or a0, a2, a0
-; RV32I-NEXT:    srli a2, a1, 4
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    srli a4, a1, 1
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a1, 2
-; RV32I-NEXT:    and a2, a2, a4
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a1, 1
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a3, a4, a3
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_bitreverse_bswap_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    slli a0, a0, 4
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    addiw a3, a3, 1365
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    srli a4, a0, 4
+; RV64I-NEXT:    and a4, a4, a1
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1438,69 +1434,69 @@ define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srli a2, a0, 4
 ; RV32ZBB-NEXT:    lui a3, 61681
+; RV32ZBB-NEXT:    lui a4, 209715
+; RV32ZBB-NEXT:    srli a5, a1, 4
 ; RV32ZBB-NEXT:    addi a3, a3, -241
 ; RV32ZBB-NEXT:    and a2, a2, a3
 ; RV32ZBB-NEXT:    and a0, a0, a3
+; RV32ZBB-NEXT:    and a5, a5, a3
+; RV32ZBB-NEXT:    and a1, a1, a3
+; RV32ZBB-NEXT:    lui a3, 349525
+; RV32ZBB-NEXT:    addi a4, a4, 819
+; RV32ZBB-NEXT:    addi a3, a3, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 4
+; RV32ZBB-NEXT:    slli a1, a1, 4
 ; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a5, a1
 ; RV32ZBB-NEXT:    srli a2, a0, 2
-; RV32ZBB-NEXT:    lui a4, 209715
-; RV32ZBB-NEXT:    addi a4, a4, 819
-; RV32ZBB-NEXT:    and a2, a2, a4
 ; RV32ZBB-NEXT:    and a0, a0, a4
+; RV32ZBB-NEXT:    srli a5, a1, 2
+; RV32ZBB-NEXT:    and a1, a1, a4
+; RV32ZBB-NEXT:    and a2, a2, a4
 ; RV32ZBB-NEXT:    slli a0, a0, 2
+; RV32ZBB-NEXT:    and a4, a5, a4
+; RV32ZBB-NEXT:    slli a1, a1, 2
 ; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a4, a1
 ; RV32ZBB-NEXT:    srli a2, a0, 1
-; RV32ZBB-NEXT:    lui a5, 349525
-; RV32ZBB-NEXT:    addi a5, a5, 1365
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a0, a0, a5
-; RV32ZBB-NEXT:    slli a0, a0, 1
-; RV32ZBB-NEXT:    or a0, a2, a0
-; RV32ZBB-NEXT:    srli a2, a1, 4
-; RV32ZBB-NEXT:    and a2, a2, a3
+; RV32ZBB-NEXT:    and a0, a0, a3
+; RV32ZBB-NEXT:    srli a4, a1, 1
 ; RV32ZBB-NEXT:    and a1, a1, a3
-; RV32ZBB-NEXT:    slli a1, a1, 4
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 2
-; RV32ZBB-NEXT:    and a2, a2, a4
-; RV32ZBB-NEXT:    and a1, a1, a4
-; RV32ZBB-NEXT:    slli a1, a1, 2
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 1
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a1, a1, a5
+; RV32ZBB-NEXT:    and a2, a2, a3
+; RV32ZBB-NEXT:    slli a0, a0, 1
+; RV32ZBB-NEXT:    and a3, a4, a3
 ; RV32ZBB-NEXT:    slli a1, a1, 1
-; RV32ZBB-NEXT:    or a1, a2, a1
+; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a3, a1
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: test_bitreverse_bswap_i64:
 ; RV64ZBB:       # %bb.0:
-; RV64ZBB-NEXT:    srli a1, a0, 4
-; RV64ZBB-NEXT:    lui a2, 61681
-; RV64ZBB-NEXT:    addiw a2, a2, -241
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
-; RV64ZBB-NEXT:    slli a0, a0, 4
-; RV64ZBB-NEXT:    or a0, a1, a0
-; RV64ZBB-NEXT:    srli a1, a0, 2
+; RV64ZBB-NEXT:    lui a1, 61681
 ; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    lui a3, 349525
+; RV64ZBB-NEXT:    addiw a1, a1, -241
 ; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    addiw a3, a3, 1365
+; RV64ZBB-NEXT:    slli a4, a1, 32
+; RV64ZBB-NEXT:    add a1, a1, a4
+; RV64ZBB-NEXT:    slli a4, a2, 32
+; RV64ZBB-NEXT:    add a2, a2, a4
+; RV64ZBB-NEXT:    slli a4, a3, 32
+; RV64ZBB-NEXT:    add a3, a3, a4
+; RV64ZBB-NEXT:    srli a4, a0, 4
+; RV64ZBB-NEXT:    and a4, a4, a1
+; RV64ZBB-NEXT:    and a0, a0, a1
+; RV64ZBB-NEXT:    slli a0, a0, 4
+; RV64ZBB-NEXT:    or a0, a4, a0
+; RV64ZBB-NEXT:    srli a1, a0, 2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a0, a0, a3
+; RV64ZBB-NEXT:    and a1, a1, a3
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/calling-conv-half.ll
index cccb69d2e6986a..541c9b4d40c7e1 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-half.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-half.ll
@@ -333,8 +333,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a0, 5
-; RV32I-NEXT:    addi t0, a0, -1792
+; RV32I-NEXT:    lui a7, 5
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    li a1, 2
 ; RV32I-NEXT:    li a2, 3
@@ -342,6 +341,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 6
 ; RV32I-NEXT:    li a6, 7
+; RV32I-NEXT:    addi t0, a7, -1792
 ; RV32I-NEXT:    li a7, 8
 ; RV32I-NEXT:    sw t0, 0(sp)
 ; RV32I-NEXT:    call callee_half_on_stack
@@ -353,8 +353,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a0, 5
-; RV64I-NEXT:    addiw t0, a0, -1792
+; RV64I-NEXT:    lui a7, 5
 ; RV64I-NEXT:    li a0, 1
 ; RV64I-NEXT:    li a1, 2
 ; RV64I-NEXT:    li a2, 3
@@ -362,6 +361,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    li a5, 6
 ; RV64I-NEXT:    li a6, 7
+; RV64I-NEXT:    addiw t0, a7, -1792
 ; RV64I-NEXT:    li a7, 8
 ; RV64I-NEXT:    sd t0, 0(sp)
 ; RV64I-NEXT:    call callee_half_on_stack
@@ -373,8 +373,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    lui a0, 1048565
-; RV32IF-NEXT:    addi t0, a0, -1792
+; RV32IF-NEXT:    lui a7, 1048565
 ; RV32IF-NEXT:    li a0, 1
 ; RV32IF-NEXT:    li a1, 2
 ; RV32IF-NEXT:    li a2, 3
@@ -382,6 +381,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32IF-NEXT:    li a4, 5
 ; RV32IF-NEXT:    li a5, 6
 ; RV32IF-NEXT:    li a6, 7
+; RV32IF-NEXT:    addi t0, a7, -1792
 ; RV32IF-NEXT:    li a7, 8
 ; RV32IF-NEXT:    sw t0, 0(sp)
 ; RV32IF-NEXT:    call callee_half_on_stack
@@ -393,8 +393,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64IF:       # %bb.0:
 ; RV64IF-NEXT:    addi sp, sp, -16
 ; RV64IF-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64IF-NEXT:    lui a0, 1048565
-; RV64IF-NEXT:    addi t0, a0, -1792
+; RV64IF-NEXT:    lui a7, 1048565
 ; RV64IF-NEXT:    li a0, 1
 ; RV64IF-NEXT:    li a1, 2
 ; RV64IF-NEXT:    li a2, 3
@@ -402,6 +401,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64IF-NEXT:    li a4, 5
 ; RV64IF-NEXT:    li a5, 6
 ; RV64IF-NEXT:    li a6, 7
+; RV64IF-NEXT:    addi t0, a7, -1792
 ; RV64IF-NEXT:    li a7, 8
 ; RV64IF-NEXT:    sw t0, 0(sp)
 ; RV64IF-NEXT:    call callee_half_on_stack
@@ -413,12 +413,12 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32-ILP32F:       # %bb.0:
 ; RV32-ILP32F-NEXT:    addi sp, sp, -16
 ; RV32-ILP32F-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-ILP32F-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32-ILP32F-NEXT:    flw fa0, %lo(.LCPI3_0)(a0)
+; RV32-ILP32F-NEXT:    lui a4, %hi(.LCPI3_0)
 ; RV32-ILP32F-NEXT:    li a0, 1
 ; RV32-ILP32F-NEXT:    li a1, 2
 ; RV32-ILP32F-NEXT:    li a2, 3
 ; RV32-ILP32F-NEXT:    li a3, 4
+; RV32-ILP32F-NEXT:    flw fa0, %lo(.LCPI3_0)(a4)
 ; RV32-ILP32F-NEXT:    li a4, 5
 ; RV32-ILP32F-NEXT:    li a5, 6
 ; RV32-ILP32F-NEXT:    li a6, 7
@@ -432,12 +432,12 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64-LP64F:       # %bb.0:
 ; RV64-LP64F-NEXT:    addi sp, sp, -16
 ; RV64-LP64F-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-LP64F-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV64-LP64F-NEXT:    flw fa0, %lo(.LCPI3_0)(a0)
+; RV64-LP64F-NEXT:    lui a4, %hi(.LCPI3_0)
 ; RV64-LP64F-NEXT:    li a0, 1
 ; RV64-LP64F-NEXT:    li a1, 2
 ; RV64-LP64F-NEXT:    li a2, 3
 ; RV64-LP64F-NEXT:    li a3, 4
+; RV64-LP64F-NEXT:    flw fa0, %lo(.LCPI3_0)(a4)
 ; RV64-LP64F-NEXT:    li a4, 5
 ; RV64-LP64F-NEXT:    li a5, 6
 ; RV64-LP64F-NEXT:    li a6, 7
@@ -451,12 +451,12 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32-ILP32ZFHMIN:       # %bb.0:
 ; RV32-ILP32ZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32-ILP32ZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-ILP32ZFHMIN-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32-ILP32ZFHMIN-NEXT:    flh fa0, %lo(.LCPI3_0)(a0)
+; RV32-ILP32ZFHMIN-NEXT:    lui a4, %hi(.LCPI3_0)
 ; RV32-ILP32ZFHMIN-NEXT:    li a0, 1
 ; RV32-ILP32ZFHMIN-NEXT:    li a1, 2
 ; RV32-ILP32ZFHMIN-NEXT:    li a2, 3
 ; RV32-ILP32ZFHMIN-NEXT:    li a3, 4
+; RV32-ILP32ZFHMIN-NEXT:    flh fa0, %lo(.LCPI3_0)(a4)
 ; RV32-ILP32ZFHMIN-NEXT:    li a4, 5
 ; RV32-ILP32ZFHMIN-NEXT:    li a5, 6
 ; RV32-ILP32ZFHMIN-NEXT:    li a6, 7
@@ -470,12 +470,12 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64-LP64ZFHMIN:       # %bb.0:
 ; RV64-LP64ZFHMIN-NEXT:    addi sp, sp, -16
 ; RV64-LP64ZFHMIN-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-LP64ZFHMIN-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV64-LP64ZFHMIN-NEXT:    flh fa0, %lo(.LCPI3_0)(a0)
+; RV64-LP64ZFHMIN-NEXT:    lui a4, %hi(.LCPI3_0)
 ; RV64-LP64ZFHMIN-NEXT:    li a0, 1
 ; RV64-LP64ZFHMIN-NEXT:    li a1, 2
 ; RV64-LP64ZFHMIN-NEXT:    li a2, 3
 ; RV64-LP64ZFHMIN-NEXT:    li a3, 4
+; RV64-LP64ZFHMIN-NEXT:    flh fa0, %lo(.LCPI3_0)(a4)
 ; RV64-LP64ZFHMIN-NEXT:    li a4, 5
 ; RV64-LP64ZFHMIN-NEXT:    li a5, 6
 ; RV64-LP64ZFHMIN-NEXT:    li a6, 7
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
index e97a3bff32fac7..9387b7ef4c32ec 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
@@ -145,43 +145,45 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32I-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    li a0, 18
-; RV32I-FPELIM-NEXT:    li a1, 17
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 24(sp)
-; RV32I-FPELIM-NEXT:    li a0, 16
-; RV32I-FPELIM-NEXT:    lui a1, 262236
-; RV32I-FPELIM-NEXT:    addi a1, a1, 655
-; RV32I-FPELIM-NEXT:    lui a2, 377487
-; RV32I-FPELIM-NEXT:    addi a2, a2, 1475
-; RV32I-FPELIM-NEXT:    li a3, 15
-; RV32I-FPELIM-NEXT:    sw a3, 0(sp)
-; RV32I-FPELIM-NEXT:    sw a2, 8(sp)
-; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 16(sp)
-; RV32I-FPELIM-NEXT:    lui a0, 262153
-; RV32I-FPELIM-NEXT:    addi t0, a0, 491
-; RV32I-FPELIM-NEXT:    lui a0, 545260
-; RV32I-FPELIM-NEXT:    addi t1, a0, -1967
-; RV32I-FPELIM-NEXT:    lui a0, 964690
-; RV32I-FPELIM-NEXT:    addi t2, a0, -328
-; RV32I-FPELIM-NEXT:    lui a0, 335544
-; RV32I-FPELIM-NEXT:    addi t3, a0, 1311
-; RV32I-FPELIM-NEXT:    lui a0, 688509
-; RV32I-FPELIM-NEXT:    addi a5, a0, -2048
+; RV32I-FPELIM-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32I-FPELIM-NEXT:    li a5, 18
+; RV32I-FPELIM-NEXT:    li a6, 17
+; RV32I-FPELIM-NEXT:    li a7, 16
+; RV32I-FPELIM-NEXT:    lui t0, 262236
+; RV32I-FPELIM-NEXT:    lui t1, 377487
+; RV32I-FPELIM-NEXT:    li t2, 15
+; RV32I-FPELIM-NEXT:    lui t3, 262153
+; RV32I-FPELIM-NEXT:    lui t4, 545260
+; RV32I-FPELIM-NEXT:    lui t5, 964690
+; RV32I-FPELIM-NEXT:    lui t6, 335544
+; RV32I-FPELIM-NEXT:    lui s0, 688509
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a1, 11
 ; RV32I-FPELIM-NEXT:    addi a2, sp, 32
 ; RV32I-FPELIM-NEXT:    li a3, 12
 ; RV32I-FPELIM-NEXT:    li a4, 13
+; RV32I-FPELIM-NEXT:    sw a6, 20(sp)
+; RV32I-FPELIM-NEXT:    sw a5, 24(sp)
 ; RV32I-FPELIM-NEXT:    li a6, 4
+; RV32I-FPELIM-NEXT:    addi a5, t0, 655
+; RV32I-FPELIM-NEXT:    addi t0, t1, 1475
+; RV32I-FPELIM-NEXT:    sw t2, 0(sp)
+; RV32I-FPELIM-NEXT:    sw t0, 8(sp)
+; RV32I-FPELIM-NEXT:    sw a5, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a7, 16(sp)
 ; RV32I-FPELIM-NEXT:    li a7, 14
+; RV32I-FPELIM-NEXT:    addi t0, t3, 491
+; RV32I-FPELIM-NEXT:    addi t1, t4, -1967
+; RV32I-FPELIM-NEXT:    addi t2, t5, -328
+; RV32I-FPELIM-NEXT:    addi t3, t6, 1311
+; RV32I-FPELIM-NEXT:    addi a5, s0, -2048
 ; RV32I-FPELIM-NEXT:    sw t3, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw t2, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw t1, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw t0, 44(sp)
 ; RV32I-FPELIM-NEXT:    call callee_aligned_stack
 ; RV32I-FPELIM-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
+; RV32I-FPELIM-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 64
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -190,38 +192,39 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-WITHFP-NEXT:    addi sp, sp, -64
 ; RV32I-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32I-WITHFP-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 64
-; RV32I-WITHFP-NEXT:    li a0, 18
-; RV32I-WITHFP-NEXT:    li a1, 17
-; RV32I-WITHFP-NEXT:    sw a1, 20(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 24(sp)
-; RV32I-WITHFP-NEXT:    li a0, 16
-; RV32I-WITHFP-NEXT:    lui a1, 262236
-; RV32I-WITHFP-NEXT:    addi a1, a1, 655
-; RV32I-WITHFP-NEXT:    lui a2, 377487
-; RV32I-WITHFP-NEXT:    addi a2, a2, 1475
-; RV32I-WITHFP-NEXT:    li a3, 15
-; RV32I-WITHFP-NEXT:    sw a3, 0(sp)
-; RV32I-WITHFP-NEXT:    sw a2, 8(sp)
-; RV32I-WITHFP-NEXT:    sw a1, 12(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 16(sp)
-; RV32I-WITHFP-NEXT:    lui a0, 262153
-; RV32I-WITHFP-NEXT:    addi t0, a0, 491
-; RV32I-WITHFP-NEXT:    lui a0, 545260
-; RV32I-WITHFP-NEXT:    addi t1, a0, -1967
-; RV32I-WITHFP-NEXT:    lui a0, 964690
-; RV32I-WITHFP-NEXT:    addi t2, a0, -328
-; RV32I-WITHFP-NEXT:    lui a0, 335544
-; RV32I-WITHFP-NEXT:    addi t3, a0, 1311
-; RV32I-WITHFP-NEXT:    lui a0, 688509
-; RV32I-WITHFP-NEXT:    addi a5, a0, -2048
+; RV32I-WITHFP-NEXT:    li a5, 18
+; RV32I-WITHFP-NEXT:    li a6, 17
+; RV32I-WITHFP-NEXT:    li a7, 16
+; RV32I-WITHFP-NEXT:    lui t0, 262236
+; RV32I-WITHFP-NEXT:    lui t1, 377487
+; RV32I-WITHFP-NEXT:    li t2, 15
+; RV32I-WITHFP-NEXT:    lui t3, 262153
+; RV32I-WITHFP-NEXT:    lui t4, 545260
+; RV32I-WITHFP-NEXT:    lui t5, 964690
+; RV32I-WITHFP-NEXT:    lui t6, 335544
+; RV32I-WITHFP-NEXT:    lui s1, 688509
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a1, 11
 ; RV32I-WITHFP-NEXT:    addi a2, s0, -32
 ; RV32I-WITHFP-NEXT:    li a3, 12
 ; RV32I-WITHFP-NEXT:    li a4, 13
+; RV32I-WITHFP-NEXT:    sw a6, 20(sp)
+; RV32I-WITHFP-NEXT:    sw a5, 24(sp)
 ; RV32I-WITHFP-NEXT:    li a6, 4
+; RV32I-WITHFP-NEXT:    addi a5, t0, 655
+; RV32I-WITHFP-NEXT:    addi t0, t1, 1475
+; RV32I-WITHFP-NEXT:    sw t2, 0(sp)
+; RV32I-WITHFP-NEXT:    sw t0, 8(sp)
+; RV32I-WITHFP-NEXT:    sw a5, 12(sp)
+; RV32I-WITHFP-NEXT:    sw a7, 16(sp)
 ; RV32I-WITHFP-NEXT:    li a7, 14
+; RV32I-WITHFP-NEXT:    addi t0, t3, 491
+; RV32I-WITHFP-NEXT:    addi t1, t4, -1967
+; RV32I-WITHFP-NEXT:    addi t2, t5, -328
+; RV32I-WITHFP-NEXT:    addi t3, t6, 1311
+; RV32I-WITHFP-NEXT:    addi a5, s1, -2048
 ; RV32I-WITHFP-NEXT:    sw t3, -32(s0)
 ; RV32I-WITHFP-NEXT:    sw t2, -28(s0)
 ; RV32I-WITHFP-NEXT:    sw t1, -24(s0)
@@ -229,6 +232,7 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-WITHFP-NEXT:    call callee_aligned_stack
 ; RV32I-WITHFP-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
+; RV32I-WITHFP-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 64
 ; RV32I-WITHFP-NEXT:    ret
   %1 = call i32 @callee_aligned_stack(i32 1, i32 11,
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 9e4c8a6e3320c2..18916dd69eb43a 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -86,15 +86,15 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; RV32I-FPELIM-NEXT:    lw t1, 0(sp)
 ; RV32I-FPELIM-NEXT:    andi a0, a0, 255
 ; RV32I-FPELIM-NEXT:    slli a1, a1, 16
+; RV32I-FPELIM-NEXT:    xor a3, a3, a7
 ; RV32I-FPELIM-NEXT:    srli a1, a1, 16
 ; RV32I-FPELIM-NEXT:    add a0, a0, a2
 ; RV32I-FPELIM-NEXT:    add a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a1, a4, t1
-; RV32I-FPELIM-NEXT:    xor a2, a3, a7
-; RV32I-FPELIM-NEXT:    or a1, a2, a1
-; RV32I-FPELIM-NEXT:    seqz a1, a1
 ; RV32I-FPELIM-NEXT:    add a0, a0, a5
+; RV32I-FPELIM-NEXT:    xor a1, a4, t1
 ; RV32I-FPELIM-NEXT:    add a0, a0, a6
+; RV32I-FPELIM-NEXT:    or a1, a3, a1
+; RV32I-FPELIM-NEXT:    seqz a1, a1
 ; RV32I-FPELIM-NEXT:    add a0, a0, t0
 ; RV32I-FPELIM-NEXT:    add a0, a1, a0
 ; RV32I-FPELIM-NEXT:    ret
@@ -109,15 +109,15 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; RV32I-WITHFP-NEXT:    lw t1, 0(s0)
 ; RV32I-WITHFP-NEXT:    andi a0, a0, 255
 ; RV32I-WITHFP-NEXT:    slli a1, a1, 16
+; RV32I-WITHFP-NEXT:    xor a3, a3, a7
 ; RV32I-WITHFP-NEXT:    srli a1, a1, 16
 ; RV32I-WITHFP-NEXT:    add a0, a0, a2
 ; RV32I-WITHFP-NEXT:    add a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a1, a4, t1
-; RV32I-WITHFP-NEXT:    xor a2, a3, a7
-; RV32I-WITHFP-NEXT:    or a1, a2, a1
-; RV32I-WITHFP-NEXT:    seqz a1, a1
 ; RV32I-WITHFP-NEXT:    add a0, a0, a5
+; RV32I-WITHFP-NEXT:    xor a1, a4, t1
 ; RV32I-WITHFP-NEXT:    add a0, a0, a6
+; RV32I-WITHFP-NEXT:    or a1, a3, a1
+; RV32I-WITHFP-NEXT:    seqz a1, a1
 ; RV32I-WITHFP-NEXT:    add a0, a0, t0
 ; RV32I-WITHFP-NEXT:    add a0, a1, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -190,21 +190,21 @@ define i32 @caller_many_scalars() nounwind {
 define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-LABEL: callee_large_scalars:
 ; RV32I-FPELIM:       # %bb.0:
-; RV32I-FPELIM-NEXT:    lw a2, 0(a0)
-; RV32I-FPELIM-NEXT:    lw a3, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a4, 12(a1)
+; RV32I-FPELIM-NEXT:    lw a2, 0(a1)
+; RV32I-FPELIM-NEXT:    lw a3, 4(a1)
+; RV32I-FPELIM-NEXT:    lw a4, 8(a1)
+; RV32I-FPELIM-NEXT:    lw a1, 12(a1)
 ; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
-; RV32I-FPELIM-NEXT:    lw a6, 0(a1)
-; RV32I-FPELIM-NEXT:    lw a7, 4(a1)
-; RV32I-FPELIM-NEXT:    lw a1, 8(a1)
-; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    xor a4, a5, a4
-; RV32I-FPELIM-NEXT:    xor a3, a3, a7
-; RV32I-FPELIM-NEXT:    or a3, a3, a4
-; RV32I-FPELIM-NEXT:    xor a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a1, a2, a6
-; RV32I-FPELIM-NEXT:    or a0, a1, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a3
+; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
+; RV32I-FPELIM-NEXT:    lw a7, 8(a0)
+; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32I-FPELIM-NEXT:    xor a1, a5, a1
+; RV32I-FPELIM-NEXT:    xor a3, a6, a3
+; RV32I-FPELIM-NEXT:    xor a4, a7, a4
+; RV32I-FPELIM-NEXT:    xor a0, a0, a2
+; RV32I-FPELIM-NEXT:    or a1, a3, a1
+; RV32I-FPELIM-NEXT:    or a0, a0, a4
+; RV32I-FPELIM-NEXT:    or a0, a0, a1
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -214,21 +214,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    lw a2, 0(a0)
-; RV32I-WITHFP-NEXT:    lw a3, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a4, 12(a1)
+; RV32I-WITHFP-NEXT:    lw a2, 0(a1)
+; RV32I-WITHFP-NEXT:    lw a3, 4(a1)
+; RV32I-WITHFP-NEXT:    lw a4, 8(a1)
+; RV32I-WITHFP-NEXT:    lw a1, 12(a1)
 ; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
-; RV32I-WITHFP-NEXT:    lw a6, 0(a1)
-; RV32I-WITHFP-NEXT:    lw a7, 4(a1)
-; RV32I-WITHFP-NEXT:    lw a1, 8(a1)
-; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    xor a4, a5, a4
-; RV32I-WITHFP-NEXT:    xor a3, a3, a7
-; RV32I-WITHFP-NEXT:    or a3, a3, a4
-; RV32I-WITHFP-NEXT:    xor a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a1, a2, a6
-; RV32I-WITHFP-NEXT:    or a0, a1, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a3
+; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
+; RV32I-WITHFP-NEXT:    lw a7, 8(a0)
+; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
+; RV32I-WITHFP-NEXT:    xor a1, a5, a1
+; RV32I-WITHFP-NEXT:    xor a3, a6, a3
+; RV32I-WITHFP-NEXT:    xor a4, a7, a4
+; RV32I-WITHFP-NEXT:    xor a0, a0, a2
+; RV32I-WITHFP-NEXT:    or a1, a3, a1
+; RV32I-WITHFP-NEXT:    or a0, a0, a4
+; RV32I-WITHFP-NEXT:    or a0, a0, a1
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -245,13 +245,13 @@ define i32 @caller_large_scalars() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
 ; RV32I-FPELIM-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    lui a0, 524272
+; RV32I-FPELIM-NEXT:    lui a1, 524272
+; RV32I-FPELIM-NEXT:    li a2, 1
+; RV32I-FPELIM-NEXT:    addi a0, sp, 24
 ; RV32I-FPELIM-NEXT:    sw zero, 0(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 4(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 8(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32I-FPELIM-NEXT:    li a2, 1
-; RV32I-FPELIM-NEXT:    addi a0, sp, 24
+; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
 ; RV32I-FPELIM-NEXT:    mv a1, sp
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 28(sp)
@@ -268,13 +268,13 @@ define i32 @caller_large_scalars() nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 48
-; RV32I-WITHFP-NEXT:    lui a0, 524272
+; RV32I-WITHFP-NEXT:    lui a1, 524272
+; RV32I-WITHFP-NEXT:    li a2, 1
+; RV32I-WITHFP-NEXT:    addi a0, s0, -24
 ; RV32I-WITHFP-NEXT:    sw zero, -48(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -44(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -40(s0)
-; RV32I-WITHFP-NEXT:    sw a0, -36(s0)
-; RV32I-WITHFP-NEXT:    li a2, 1
-; RV32I-WITHFP-NEXT:    addi a0, s0, -24
+; RV32I-WITHFP-NEXT:    sw a1, -36(s0)
 ; RV32I-WITHFP-NEXT:    addi a1, s0, -48
 ; RV32I-WITHFP-NEXT:    sw a2, -24(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -20(s0)
@@ -299,18 +299,18 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-NEXT:    lw a0, 4(sp)
 ; RV32I-FPELIM-NEXT:    lw a1, 0(a7)
 ; RV32I-FPELIM-NEXT:    lw a2, 4(a7)
-; RV32I-FPELIM-NEXT:    lw a3, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a3, 8(a7)
 ; RV32I-FPELIM-NEXT:    lw a4, 12(a7)
-; RV32I-FPELIM-NEXT:    lw a5, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
 ; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    lw a7, 8(a7)
-; RV32I-FPELIM-NEXT:    xor a3, a4, a3
+; RV32I-FPELIM-NEXT:    lw a7, 8(a0)
+; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32I-FPELIM-NEXT:    xor a4, a4, a5
 ; RV32I-FPELIM-NEXT:    xor a2, a2, a6
-; RV32I-FPELIM-NEXT:    or a2, a2, a3
-; RV32I-FPELIM-NEXT:    xor a0, a7, a0
-; RV32I-FPELIM-NEXT:    xor a1, a1, a5
-; RV32I-FPELIM-NEXT:    or a0, a1, a0
+; RV32I-FPELIM-NEXT:    xor a3, a3, a7
+; RV32I-FPELIM-NEXT:    xor a0, a1, a0
+; RV32I-FPELIM-NEXT:    or a2, a2, a4
+; RV32I-FPELIM-NEXT:    or a0, a0, a3
 ; RV32I-FPELIM-NEXT:    or a0, a0, a2
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
@@ -324,18 +324,18 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
 ; RV32I-WITHFP-NEXT:    lw a1, 0(a7)
 ; RV32I-WITHFP-NEXT:    lw a2, 4(a7)
-; RV32I-WITHFP-NEXT:    lw a3, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a3, 8(a7)
 ; RV32I-WITHFP-NEXT:    lw a4, 12(a7)
-; RV32I-WITHFP-NEXT:    lw a5, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
 ; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    lw a7, 8(a7)
-; RV32I-WITHFP-NEXT:    xor a3, a4, a3
+; RV32I-WITHFP-NEXT:    lw a7, 8(a0)
+; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
+; RV32I-WITHFP-NEXT:    xor a4, a4, a5
 ; RV32I-WITHFP-NEXT:    xor a2, a2, a6
-; RV32I-WITHFP-NEXT:    or a2, a2, a3
-; RV32I-WITHFP-NEXT:    xor a0, a7, a0
-; RV32I-WITHFP-NEXT:    xor a1, a1, a5
-; RV32I-WITHFP-NEXT:    or a0, a1, a0
+; RV32I-WITHFP-NEXT:    xor a3, a3, a7
+; RV32I-WITHFP-NEXT:    xor a0, a1, a0
+; RV32I-WITHFP-NEXT:    or a2, a2, a4
+; RV32I-WITHFP-NEXT:    or a0, a0, a3
 ; RV32I-WITHFP-NEXT:    or a0, a0, a2
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -353,25 +353,25 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32I-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    addi a0, sp, 16
-; RV32I-FPELIM-NEXT:    li a1, 9
-; RV32I-FPELIM-NEXT:    sw a1, 0(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 4(sp)
-; RV32I-FPELIM-NEXT:    lui a0, 524272
-; RV32I-FPELIM-NEXT:    sw zero, 16(sp)
-; RV32I-FPELIM-NEXT:    sw zero, 20(sp)
-; RV32I-FPELIM-NEXT:    sw zero, 24(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 28(sp)
-; RV32I-FPELIM-NEXT:    li t0, 8
+; RV32I-FPELIM-NEXT:    addi a6, sp, 16
+; RV32I-FPELIM-NEXT:    li a7, 9
+; RV32I-FPELIM-NEXT:    lui t0, 524272
+; RV32I-FPELIM-NEXT:    li t1, 8
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a1, 2
 ; RV32I-FPELIM-NEXT:    li a2, 3
 ; RV32I-FPELIM-NEXT:    li a3, 4
 ; RV32I-FPELIM-NEXT:    li a4, 5
 ; RV32I-FPELIM-NEXT:    li a5, 6
+; RV32I-FPELIM-NEXT:    sw a7, 0(sp)
+; RV32I-FPELIM-NEXT:    sw a6, 4(sp)
 ; RV32I-FPELIM-NEXT:    li a6, 7
+; RV32I-FPELIM-NEXT:    sw zero, 16(sp)
+; RV32I-FPELIM-NEXT:    sw zero, 20(sp)
+; RV32I-FPELIM-NEXT:    sw zero, 24(sp)
+; RV32I-FPELIM-NEXT:    sw t0, 28(sp)
 ; RV32I-FPELIM-NEXT:    addi a7, sp, 40
-; RV32I-FPELIM-NEXT:    sw t0, 40(sp)
+; RV32I-FPELIM-NEXT:    sw t1, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 48(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 52(sp)
@@ -386,25 +386,25 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 64
-; RV32I-WITHFP-NEXT:    addi a0, s0, -48
-; RV32I-WITHFP-NEXT:    li a1, 9
-; RV32I-WITHFP-NEXT:    sw a1, 0(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 4(sp)
-; RV32I-WITHFP-NEXT:    lui a0, 524272
-; RV32I-WITHFP-NEXT:    sw zero, -48(s0)
-; RV32I-WITHFP-NEXT:    sw zero, -44(s0)
-; RV32I-WITHFP-NEXT:    sw zero, -40(s0)
-; RV32I-WITHFP-NEXT:    sw a0, -36(s0)
-; RV32I-WITHFP-NEXT:    li t0, 8
+; RV32I-WITHFP-NEXT:    addi a6, s0, -48
+; RV32I-WITHFP-NEXT:    li a7, 9
+; RV32I-WITHFP-NEXT:    lui t0, 524272
+; RV32I-WITHFP-NEXT:    li t1, 8
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a1, 2
 ; RV32I-WITHFP-NEXT:    li a2, 3
 ; RV32I-WITHFP-NEXT:    li a3, 4
 ; RV32I-WITHFP-NEXT:    li a4, 5
 ; RV32I-WITHFP-NEXT:    li a5, 6
+; RV32I-WITHFP-NEXT:    sw a7, 0(sp)
+; RV32I-WITHFP-NEXT:    sw a6, 4(sp)
 ; RV32I-WITHFP-NEXT:    li a6, 7
+; RV32I-WITHFP-NEXT:    sw zero, -48(s0)
+; RV32I-WITHFP-NEXT:    sw zero, -44(s0)
+; RV32I-WITHFP-NEXT:    sw zero, -40(s0)
+; RV32I-WITHFP-NEXT:    sw t0, -36(s0)
 ; RV32I-WITHFP-NEXT:    addi a7, s0, -24
-; RV32I-WITHFP-NEXT:    sw t0, -24(s0)
+; RV32I-WITHFP-NEXT:    sw t1, -24(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -20(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -16(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -12(s0)
@@ -664,34 +664,34 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32I-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    li a0, 19
-; RV32I-FPELIM-NEXT:    li a1, 18
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 24(sp)
-; RV32I-FPELIM-NEXT:    li a0, 17
-; RV32I-FPELIM-NEXT:    li a1, 16
-; RV32I-FPELIM-NEXT:    li a2, 15
-; RV32I-FPELIM-NEXT:    sw a2, 0(sp)
-; RV32I-FPELIM-NEXT:    sw a1, 8(sp)
-; RV32I-FPELIM-NEXT:    sw zero, 12(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 16(sp)
-; RV32I-FPELIM-NEXT:    lui a0, 262153
-; RV32I-FPELIM-NEXT:    addi t0, a0, 491
-; RV32I-FPELIM-NEXT:    lui a0, 545260
-; RV32I-FPELIM-NEXT:    addi t1, a0, -1967
-; RV32I-FPELIM-NEXT:    lui a0, 964690
-; RV32I-FPELIM-NEXT:    addi t2, a0, -328
-; RV32I-FPELIM-NEXT:    lui a0, 335544
-; RV32I-FPELIM-NEXT:    addi t3, a0, 1311
-; RV32I-FPELIM-NEXT:    lui a0, 688509
-; RV32I-FPELIM-NEXT:    addi a5, a0, -2048
+; RV32I-FPELIM-NEXT:    li a5, 19
+; RV32I-FPELIM-NEXT:    li a6, 18
+; RV32I-FPELIM-NEXT:    li a7, 17
+; RV32I-FPELIM-NEXT:    li t0, 16
+; RV32I-FPELIM-NEXT:    li t1, 15
+; RV32I-FPELIM-NEXT:    lui t2, 262153
+; RV32I-FPELIM-NEXT:    lui t3, 545260
+; RV32I-FPELIM-NEXT:    lui t4, 964690
+; RV32I-FPELIM-NEXT:    lui t5, 335544
+; RV32I-FPELIM-NEXT:    lui t6, 688509
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a1, 11
 ; RV32I-FPELIM-NEXT:    addi a2, sp, 32
 ; RV32I-FPELIM-NEXT:    li a3, 12
 ; RV32I-FPELIM-NEXT:    li a4, 13
+; RV32I-FPELIM-NEXT:    sw a6, 20(sp)
+; RV32I-FPELIM-NEXT:    sw a5, 24(sp)
 ; RV32I-FPELIM-NEXT:    li a6, 4
+; RV32I-FPELIM-NEXT:    sw t1, 0(sp)
+; RV32I-FPELIM-NEXT:    sw t0, 8(sp)
+; RV32I-FPELIM-NEXT:    sw zero, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a7, 16(sp)
 ; RV32I-FPELIM-NEXT:    li a7, 14
+; RV32I-FPELIM-NEXT:    addi t0, t2, 491
+; RV32I-FPELIM-NEXT:    addi t1, t3, -1967
+; RV32I-FPELIM-NEXT:    addi t2, t4, -328
+; RV32I-FPELIM-NEXT:    addi t3, t5, 1311
+; RV32I-FPELIM-NEXT:    addi a5, t6, -2048
 ; RV32I-FPELIM-NEXT:    sw t3, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw t2, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw t1, 40(sp)
@@ -707,34 +707,34 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 64
-; RV32I-WITHFP-NEXT:    li a0, 19
-; RV32I-WITHFP-NEXT:    li a1, 18
-; RV32I-WITHFP-NEXT:    sw a1, 20(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 24(sp)
-; RV32I-WITHFP-NEXT:    li a0, 17
-; RV32I-WITHFP-NEXT:    li a1, 16
-; RV32I-WITHFP-NEXT:    li a2, 15
-; RV32I-WITHFP-NEXT:    sw a2, 0(sp)
-; RV32I-WITHFP-NEXT:    sw a1, 8(sp)
-; RV32I-WITHFP-NEXT:    sw zero, 12(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 16(sp)
-; RV32I-WITHFP-NEXT:    lui a0, 262153
-; RV32I-WITHFP-NEXT:    addi t0, a0, 491
-; RV32I-WITHFP-NEXT:    lui a0, 545260
-; RV32I-WITHFP-NEXT:    addi t1, a0, -1967
-; RV32I-WITHFP-NEXT:    lui a0, 964690
-; RV32I-WITHFP-NEXT:    addi t2, a0, -328
-; RV32I-WITHFP-NEXT:    lui a0, 335544
-; RV32I-WITHFP-NEXT:    addi t3, a0, 1311
-; RV32I-WITHFP-NEXT:    lui a0, 688509
-; RV32I-WITHFP-NEXT:    addi a5, a0, -2048
+; RV32I-WITHFP-NEXT:    li a5, 19
+; RV32I-WITHFP-NEXT:    li a6, 18
+; RV32I-WITHFP-NEXT:    li a7, 17
+; RV32I-WITHFP-NEXT:    li t0, 16
+; RV32I-WITHFP-NEXT:    li t1, 15
+; RV32I-WITHFP-NEXT:    lui t2, 262153
+; RV32I-WITHFP-NEXT:    lui t3, 545260
+; RV32I-WITHFP-NEXT:    lui t4, 964690
+; RV32I-WITHFP-NEXT:    lui t5, 335544
+; RV32I-WITHFP-NEXT:    lui t6, 688509
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a1, 11
 ; RV32I-WITHFP-NEXT:    addi a2, s0, -32
 ; RV32I-WITHFP-NEXT:    li a3, 12
 ; RV32I-WITHFP-NEXT:    li a4, 13
+; RV32I-WITHFP-NEXT:    sw a6, 20(sp)
+; RV32I-WITHFP-NEXT:    sw a5, 24(sp)
 ; RV32I-WITHFP-NEXT:    li a6, 4
+; RV32I-WITHFP-NEXT:    sw t1, 0(sp)
+; RV32I-WITHFP-NEXT:    sw t0, 8(sp)
+; RV32I-WITHFP-NEXT:    sw zero, 12(sp)
+; RV32I-WITHFP-NEXT:    sw a7, 16(sp)
 ; RV32I-WITHFP-NEXT:    li a7, 14
+; RV32I-WITHFP-NEXT:    addi t0, t2, 491
+; RV32I-WITHFP-NEXT:    addi t1, t3, -1967
+; RV32I-WITHFP-NEXT:    addi t2, t4, -328
+; RV32I-WITHFP-NEXT:    addi t3, t5, 1311
+; RV32I-WITHFP-NEXT:    addi a5, t6, -2048
 ; RV32I-WITHFP-NEXT:    sw t3, -32(s0)
 ; RV32I-WITHFP-NEXT:    sw t2, -28(s0)
 ; RV32I-WITHFP-NEXT:    sw t1, -24(s0)
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
index 1321413fbc57e8..7630d5b8f77ef4 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
@@ -97,21 +97,21 @@ define i32 @caller_double_in_gpr_exhausted_fprs() nounwind {
 ; RV32-ILP32D-NEXT:    addi sp, sp, -16
 ; RV32-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_0)
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI5_1)
 ; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI5_0)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_1)
-; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI5_1)(a0)
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_2)
+; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI5_1)(a1)
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI5_3)
 ; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI5_2)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_3)
-; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI5_3)(a0)
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_4)
+; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI5_3)(a1)
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI5_5)
 ; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI5_4)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_5)
-; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI5_5)(a0)
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_6)
+; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI5_5)(a1)
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI5_7)
 ; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI5_6)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_7)
-; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI5_7)(a0)
+; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI5_7)(a1)
 ; RV32-ILP32D-NEXT:    lui a1, 262688
 ; RV32-ILP32D-NEXT:    li a0, 0
 ; RV32-ILP32D-NEXT:    call callee_double_in_gpr_exhausted_fprs
@@ -149,20 +149,20 @@ define i32 @caller_double_in_gpr_and_stack_almost_exhausted_gprs_fprs() nounwind
 ; RV32-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32D-NEXT:    lui a1, 262816
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_0)
+; RV32-ILP32D-NEXT:    lui a2, %hi(.LCPI7_1)
+; RV32-ILP32D-NEXT:    lui a3, %hi(.LCPI7_2)
+; RV32-ILP32D-NEXT:    lui a4, %hi(.LCPI7_3)
+; RV32-ILP32D-NEXT:    lui a5, %hi(.LCPI7_4)
+; RV32-ILP32D-NEXT:    lui a6, %hi(.LCPI7_5)
+; RV32-ILP32D-NEXT:    lui a7, %hi(.LCPI7_6)
 ; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI7_0)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_1)
-; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI7_1)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_2)
-; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI7_2)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_3)
-; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI7_3)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_4)
-; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI7_4)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_5)
-; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI7_5)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_6)
-; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI7_6)(a0)
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_7)
+; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI7_1)(a2)
+; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI7_2)(a3)
+; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI7_3)(a4)
+; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI7_4)(a5)
+; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI7_5)(a6)
+; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI7_6)(a7)
 ; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI7_7)(a0)
 ; RV32-ILP32D-NEXT:    li a0, 1
 ; RV32-ILP32D-NEXT:    li a2, 3
@@ -205,22 +205,22 @@ define i32 @caller_double_on_stack_exhausted_gprs_fprs() nounwind {
 ; RV32-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32D-NEXT:    lui a1, 262816
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_0)
+; RV32-ILP32D-NEXT:    lui a2, %hi(.LCPI9_1)
+; RV32-ILP32D-NEXT:    lui a3, %hi(.LCPI9_2)
+; RV32-ILP32D-NEXT:    lui a4, %hi(.LCPI9_3)
+; RV32-ILP32D-NEXT:    lui a5, %hi(.LCPI9_4)
+; RV32-ILP32D-NEXT:    lui a6, %hi(.LCPI9_5)
+; RV32-ILP32D-NEXT:    lui a7, %hi(.LCPI9_6)
 ; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI9_0)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_1)
-; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI9_1)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_2)
-; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI9_2)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_3)
-; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI9_3)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_4)
-; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI9_4)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_5)
-; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI9_5)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_6)
-; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI9_6)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_7)
-; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI9_7)(a0)
+; RV32-ILP32D-NEXT:    lui t0, %hi(.LCPI9_7)
+; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI9_1)(a2)
 ; RV32-ILP32D-NEXT:    li a0, 1
+; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI9_2)(a3)
+; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI9_3)(a4)
+; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI9_4)(a5)
+; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI9_5)(a6)
+; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI9_6)(a7)
+; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI9_7)(t0)
 ; RV32-ILP32D-NEXT:    li a2, 3
 ; RV32-ILP32D-NEXT:    li a4, 5
 ; RV32-ILP32D-NEXT:    li a6, 7
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
index 2b779cd34a8072..e16bed5400300b 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
@@ -694,39 +694,39 @@ define void @caller_aligned_stack() {
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    li a0, 18
-; ILP32E-FPELIM-NEXT:    li a1, 17
-; ILP32E-FPELIM-NEXT:    li a2, 16
-; ILP32E-FPELIM-NEXT:    lui a3, 262236
-; ILP32E-FPELIM-NEXT:    addi a3, a3, 655
-; ILP32E-FPELIM-NEXT:    sw a3, 16(sp)
-; ILP32E-FPELIM-NEXT:    sw a2, 20(sp)
-; ILP32E-FPELIM-NEXT:    sw a1, 24(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 28(sp)
-; ILP32E-FPELIM-NEXT:    lui a0, 377487
-; ILP32E-FPELIM-NEXT:    addi a0, a0, 1475
-; ILP32E-FPELIM-NEXT:    li a1, 15
-; ILP32E-FPELIM-NEXT:    li a2, 14
-; ILP32E-FPELIM-NEXT:    li a3, 4
-; ILP32E-FPELIM-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-NEXT:    sw a1, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-NEXT:    lui a0, 262153
-; ILP32E-FPELIM-NEXT:    addi a6, a0, 491
-; ILP32E-FPELIM-NEXT:    lui a0, 545260
-; ILP32E-FPELIM-NEXT:    addi a7, a0, -1967
-; ILP32E-FPELIM-NEXT:    lui a0, 964690
-; ILP32E-FPELIM-NEXT:    addi t0, a0, -328
-; ILP32E-FPELIM-NEXT:    lui a0, 335544
-; ILP32E-FPELIM-NEXT:    addi t1, a0, 1311
-; ILP32E-FPELIM-NEXT:    lui a0, 688509
-; ILP32E-FPELIM-NEXT:    addi a5, a0, -2048
+; ILP32E-FPELIM-NEXT:    li a3, 18
+; ILP32E-FPELIM-NEXT:    li a4, 17
+; ILP32E-FPELIM-NEXT:    li a5, 16
+; ILP32E-FPELIM-NEXT:    lui a6, 262236
+; ILP32E-FPELIM-NEXT:    lui a7, 377487
+; ILP32E-FPELIM-NEXT:    li t0, 15
+; ILP32E-FPELIM-NEXT:    li t1, 14
+; ILP32E-FPELIM-NEXT:    li t2, 4
+; ILP32E-FPELIM-NEXT:    lui t3, 262153
+; ILP32E-FPELIM-NEXT:    lui t4, 545260
+; ILP32E-FPELIM-NEXT:    lui t5, 964690
+; ILP32E-FPELIM-NEXT:    lui t6, 335544
+; ILP32E-FPELIM-NEXT:    lui s2, 688509
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a1, 11
 ; ILP32E-FPELIM-NEXT:    addi a2, sp, 32
+; ILP32E-FPELIM-NEXT:    addi a6, a6, 655
+; ILP32E-FPELIM-NEXT:    sw a6, 16(sp)
+; ILP32E-FPELIM-NEXT:    sw a5, 20(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 24(sp)
+; ILP32E-FPELIM-NEXT:    sw a3, 28(sp)
 ; ILP32E-FPELIM-NEXT:    li a3, 12
+; ILP32E-FPELIM-NEXT:    addi a4, a7, 1475
+; ILP32E-FPELIM-NEXT:    sw t2, 0(sp)
+; ILP32E-FPELIM-NEXT:    sw t1, 4(sp)
+; ILP32E-FPELIM-NEXT:    sw t0, 8(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-NEXT:    li a4, 13
+; ILP32E-FPELIM-NEXT:    addi a6, t3, 491
+; ILP32E-FPELIM-NEXT:    addi a7, t4, -1967
+; ILP32E-FPELIM-NEXT:    addi t0, t5, -328
+; ILP32E-FPELIM-NEXT:    addi t1, t6, 1311
+; ILP32E-FPELIM-NEXT:    addi a5, s2, -2048
 ; ILP32E-FPELIM-NEXT:    sw t1, 32(sp)
 ; ILP32E-FPELIM-NEXT:    sw t0, 36(sp)
 ; ILP32E-FPELIM-NEXT:    sw a7, 40(sp)
@@ -753,39 +753,39 @@ define void @caller_aligned_stack() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    li a0, 18
-; ILP32E-WITHFP-NEXT:    li a1, 17
-; ILP32E-WITHFP-NEXT:    li a2, 16
-; ILP32E-WITHFP-NEXT:    lui a3, 262236
-; ILP32E-WITHFP-NEXT:    addi a3, a3, 655
-; ILP32E-WITHFP-NEXT:    sw a3, 16(sp)
-; ILP32E-WITHFP-NEXT:    sw a2, 20(sp)
-; ILP32E-WITHFP-NEXT:    sw a1, 24(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 28(sp)
-; ILP32E-WITHFP-NEXT:    lui a0, 377487
-; ILP32E-WITHFP-NEXT:    addi a0, a0, 1475
-; ILP32E-WITHFP-NEXT:    li a1, 15
-; ILP32E-WITHFP-NEXT:    li a2, 14
-; ILP32E-WITHFP-NEXT:    li a3, 4
-; ILP32E-WITHFP-NEXT:    sw a3, 0(sp)
-; ILP32E-WITHFP-NEXT:    sw a2, 4(sp)
-; ILP32E-WITHFP-NEXT:    sw a1, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-NEXT:    lui a0, 262153
-; ILP32E-WITHFP-NEXT:    addi a6, a0, 491
-; ILP32E-WITHFP-NEXT:    lui a0, 545260
-; ILP32E-WITHFP-NEXT:    addi a7, a0, -1967
-; ILP32E-WITHFP-NEXT:    lui a0, 964690
-; ILP32E-WITHFP-NEXT:    addi t0, a0, -328
-; ILP32E-WITHFP-NEXT:    lui a0, 335544
-; ILP32E-WITHFP-NEXT:    addi t1, a0, 1311
-; ILP32E-WITHFP-NEXT:    lui a0, 688509
-; ILP32E-WITHFP-NEXT:    addi a5, a0, -2048
+; ILP32E-WITHFP-NEXT:    li a3, 18
+; ILP32E-WITHFP-NEXT:    li a4, 17
+; ILP32E-WITHFP-NEXT:    li a5, 16
+; ILP32E-WITHFP-NEXT:    lui a6, 262236
+; ILP32E-WITHFP-NEXT:    lui a7, 377487
+; ILP32E-WITHFP-NEXT:    li t0, 15
+; ILP32E-WITHFP-NEXT:    li t1, 14
+; ILP32E-WITHFP-NEXT:    li t2, 4
+; ILP32E-WITHFP-NEXT:    lui t3, 262153
+; ILP32E-WITHFP-NEXT:    lui t4, 545260
+; ILP32E-WITHFP-NEXT:    lui t5, 964690
+; ILP32E-WITHFP-NEXT:    lui t6, 335544
+; ILP32E-WITHFP-NEXT:    lui s2, 688509
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a1, 11
 ; ILP32E-WITHFP-NEXT:    addi a2, sp, 32
+; ILP32E-WITHFP-NEXT:    addi a6, a6, 655
+; ILP32E-WITHFP-NEXT:    sw a6, 16(sp)
+; ILP32E-WITHFP-NEXT:    sw a5, 20(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 24(sp)
+; ILP32E-WITHFP-NEXT:    sw a3, 28(sp)
 ; ILP32E-WITHFP-NEXT:    li a3, 12
+; ILP32E-WITHFP-NEXT:    addi a4, a7, 1475
+; ILP32E-WITHFP-NEXT:    sw t2, 0(sp)
+; ILP32E-WITHFP-NEXT:    sw t1, 4(sp)
+; ILP32E-WITHFP-NEXT:    sw t0, 8(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 12(sp)
 ; ILP32E-WITHFP-NEXT:    li a4, 13
+; ILP32E-WITHFP-NEXT:    addi a6, t3, 491
+; ILP32E-WITHFP-NEXT:    addi a7, t4, -1967
+; ILP32E-WITHFP-NEXT:    addi t0, t5, -328
+; ILP32E-WITHFP-NEXT:    addi t1, t6, 1311
+; ILP32E-WITHFP-NEXT:    addi a5, s2, -2048
 ; ILP32E-WITHFP-NEXT:    sw t1, 32(sp)
 ; ILP32E-WITHFP-NEXT:    sw t0, 36(sp)
 ; ILP32E-WITHFP-NEXT:    sw a7, 40(sp)
@@ -812,39 +812,39 @@ define void @caller_aligned_stack() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 18
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 17
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a3, 262236
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a3, a3, 655
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 16(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 20(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 24(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 28(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 377487
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, a0, 1475
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 15
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 14
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 262153
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, a0, 491
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 545260
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a7, a0, -1967
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 964690
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t0, a0, -328
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 335544
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t1, a0, 1311
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 688509
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a5, a0, -2048
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 18
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 17
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 16
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a6, 262236
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a7, 377487
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t0, 15
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t1, 14
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t2, 4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t3, 262153
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t4, 545260
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t5, 964690
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t6, 335544
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui s2, 688509
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 11
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a2, sp, 32
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, a6, 655
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 16(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a5, 20(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 24(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 28(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 12
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a4, a7, 1475
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t2, 0(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 4(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 8(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 13
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, t3, 491
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a7, t4, -1967
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t0, t5, -328
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t1, t6, 1311
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a5, s2, -2048
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 32(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 36(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 40(sp)
@@ -867,39 +867,39 @@ define void @caller_aligned_stack() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 18
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 17
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a3, 262236
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a3, a3, 655
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 16(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 20(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 24(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 28(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 377487
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, a0, 1475
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 15
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 14
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 4(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 262153
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, a0, 491
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 545260
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a7, a0, -1967
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 964690
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t0, a0, -328
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 335544
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t1, a0, 1311
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 688509
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a5, a0, -2048
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 18
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 17
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 16
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a6, 262236
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a7, 377487
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t0, 15
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t1, 14
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t2, 4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t3, 262153
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t4, 545260
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t5, 964690
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t6, 335544
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui s2, 688509
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 11
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a2, sp, 32
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, a6, 655
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 16(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a5, 20(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 24(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 28(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 12
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a4, a7, 1475
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t2, 0(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 4(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 8(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 13
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, t3, 491
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a7, t4, -1967
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t0, t5, -328
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t1, t6, 1311
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a5, s2, -2048
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 32(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 36(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 40(sp)
@@ -1157,12 +1157,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; ILP32E-FPELIM-NEXT:    srli a1, a1, 16
 ; ILP32E-FPELIM-NEXT:    add a0, a0, a2
 ; ILP32E-FPELIM-NEXT:    add a0, a0, a1
+; ILP32E-FPELIM-NEXT:    add a0, a0, a5
 ; ILP32E-FPELIM-NEXT:    xor a1, a4, t1
 ; ILP32E-FPELIM-NEXT:    xor a2, a3, t0
+; ILP32E-FPELIM-NEXT:    add a0, a0, a7
 ; ILP32E-FPELIM-NEXT:    or a1, a2, a1
 ; ILP32E-FPELIM-NEXT:    seqz a1, a1
-; ILP32E-FPELIM-NEXT:    add a0, a0, a5
-; ILP32E-FPELIM-NEXT:    add a0, a0, a7
 ; ILP32E-FPELIM-NEXT:    add a0, a0, a6
 ; ILP32E-FPELIM-NEXT:    add a0, a1, a0
 ; ILP32E-FPELIM-NEXT:    ret
@@ -1186,12 +1186,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; ILP32E-WITHFP-NEXT:    srli a1, a1, 16
 ; ILP32E-WITHFP-NEXT:    add a0, a0, a2
 ; ILP32E-WITHFP-NEXT:    add a0, a0, a1
+; ILP32E-WITHFP-NEXT:    add a0, a0, a5
 ; ILP32E-WITHFP-NEXT:    xor a1, a4, t1
 ; ILP32E-WITHFP-NEXT:    xor a2, a3, t0
+; ILP32E-WITHFP-NEXT:    add a0, a0, a7
 ; ILP32E-WITHFP-NEXT:    or a1, a2, a1
 ; ILP32E-WITHFP-NEXT:    seqz a1, a1
-; ILP32E-WITHFP-NEXT:    add a0, a0, a5
-; ILP32E-WITHFP-NEXT:    add a0, a0, a7
 ; ILP32E-WITHFP-NEXT:    add a0, a0, a6
 ; ILP32E-WITHFP-NEXT:    add a0, a1, a0
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 8
@@ -1214,12 +1214,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    srli a1, a1, 16
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a2
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a5
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a4, t1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a2, a3, t0
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a7
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a1, a2, a1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    seqz a1, a1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a5
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a7
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a6
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a1, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    ret
@@ -1241,12 +1241,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    srli a1, a1, 16
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a2
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a5
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a4, t1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a2, a3, t0
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a7
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a1, a2, a1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    seqz a1, a1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a5
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a7
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a6
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a1, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 8
@@ -1386,21 +1386,21 @@ define i32 @caller_many_scalars() {
 define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-FPELIM-LABEL: callee_large_scalars:
 ; ILP32E-FPELIM:       # %bb.0:
-; ILP32E-FPELIM-NEXT:    lw a2, 0(a0)
-; ILP32E-FPELIM-NEXT:    lw a3, 4(a0)
-; ILP32E-FPELIM-NEXT:    lw a4, 12(a1)
+; ILP32E-FPELIM-NEXT:    lw a2, 0(a1)
+; ILP32E-FPELIM-NEXT:    lw a3, 4(a1)
+; ILP32E-FPELIM-NEXT:    lw a4, 8(a1)
+; ILP32E-FPELIM-NEXT:    lw a1, 12(a1)
 ; ILP32E-FPELIM-NEXT:    lw a5, 12(a0)
-; ILP32E-FPELIM-NEXT:    lw a6, 0(a1)
-; ILP32E-FPELIM-NEXT:    lw a7, 4(a1)
-; ILP32E-FPELIM-NEXT:    lw a1, 8(a1)
-; ILP32E-FPELIM-NEXT:    lw a0, 8(a0)
-; ILP32E-FPELIM-NEXT:    xor a4, a5, a4
-; ILP32E-FPELIM-NEXT:    xor a3, a3, a7
-; ILP32E-FPELIM-NEXT:    or a3, a3, a4
-; ILP32E-FPELIM-NEXT:    xor a0, a0, a1
-; ILP32E-FPELIM-NEXT:    xor a1, a2, a6
-; ILP32E-FPELIM-NEXT:    or a0, a1, a0
-; ILP32E-FPELIM-NEXT:    or a0, a0, a3
+; ILP32E-FPELIM-NEXT:    lw a6, 4(a0)
+; ILP32E-FPELIM-NEXT:    lw a7, 8(a0)
+; ILP32E-FPELIM-NEXT:    lw a0, 0(a0)
+; ILP32E-FPELIM-NEXT:    xor a1, a5, a1
+; ILP32E-FPELIM-NEXT:    xor a3, a6, a3
+; ILP32E-FPELIM-NEXT:    xor a4, a7, a4
+; ILP32E-FPELIM-NEXT:    xor a0, a0, a2
+; ILP32E-FPELIM-NEXT:    or a1, a3, a1
+; ILP32E-FPELIM-NEXT:    or a0, a0, a4
+; ILP32E-FPELIM-NEXT:    or a0, a0, a1
 ; ILP32E-FPELIM-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-NEXT:    ret
 ;
@@ -1414,21 +1414,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-WITHFP-NEXT:    .cfi_offset s0, -8
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
-; ILP32E-WITHFP-NEXT:    lw a2, 0(a0)
-; ILP32E-WITHFP-NEXT:    lw a3, 4(a0)
-; ILP32E-WITHFP-NEXT:    lw a4, 12(a1)
+; ILP32E-WITHFP-NEXT:    lw a2, 0(a1)
+; ILP32E-WITHFP-NEXT:    lw a3, 4(a1)
+; ILP32E-WITHFP-NEXT:    lw a4, 8(a1)
+; ILP32E-WITHFP-NEXT:    lw a1, 12(a1)
 ; ILP32E-WITHFP-NEXT:    lw a5, 12(a0)
-; ILP32E-WITHFP-NEXT:    lw a6, 0(a1)
-; ILP32E-WITHFP-NEXT:    lw a7, 4(a1)
-; ILP32E-WITHFP-NEXT:    lw a1, 8(a1)
-; ILP32E-WITHFP-NEXT:    lw a0, 8(a0)
-; ILP32E-WITHFP-NEXT:    xor a4, a5, a4
-; ILP32E-WITHFP-NEXT:    xor a3, a3, a7
-; ILP32E-WITHFP-NEXT:    or a3, a3, a4
-; ILP32E-WITHFP-NEXT:    xor a0, a0, a1
-; ILP32E-WITHFP-NEXT:    xor a1, a2, a6
-; ILP32E-WITHFP-NEXT:    or a0, a1, a0
-; ILP32E-WITHFP-NEXT:    or a0, a0, a3
+; ILP32E-WITHFP-NEXT:    lw a6, 4(a0)
+; ILP32E-WITHFP-NEXT:    lw a7, 8(a0)
+; ILP32E-WITHFP-NEXT:    lw a0, 0(a0)
+; ILP32E-WITHFP-NEXT:    xor a1, a5, a1
+; ILP32E-WITHFP-NEXT:    xor a3, a6, a3
+; ILP32E-WITHFP-NEXT:    xor a4, a7, a4
+; ILP32E-WITHFP-NEXT:    xor a0, a0, a2
+; ILP32E-WITHFP-NEXT:    or a1, a3, a1
+; ILP32E-WITHFP-NEXT:    or a0, a0, a4
+; ILP32E-WITHFP-NEXT:    or a0, a0, a1
 ; ILP32E-WITHFP-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-NEXT:    lw ra, 4(sp) # 4-byte Folded Reload
@@ -1441,21 +1441,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ;
 ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars:
 ; ILP32E-FPELIM-SAVE-RESTORE:       # %bb.0:
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a2, 0(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 12(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a2, 0(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a3, 4(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 8(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 12(a1)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 0(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 4(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 8(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 8(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a5, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a3, a7
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a3, a3, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a0, a1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a2, a6
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a1, a0
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 4(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 8(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 0(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a5, a1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a6, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a7, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a0, a2
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a1, a3, a1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    ret
 ;
@@ -1467,21 +1467,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_offset s0, -8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a2, 0(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 12(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a2, 0(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a3, 4(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 8(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 12(a1)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 0(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 4(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 8(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 8(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a5, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a3, a7
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a3, a3, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a0, a1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a2, a6
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a1, a0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 4(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 8(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 0(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a5, a1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a6, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a7, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a0, a2
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a1, a3, a1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    tail __riscv_restore_1
@@ -1503,13 +1503,13 @@ define i32 @caller_large_scalars() {
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 48
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    lui a0, 524272
+; ILP32E-FPELIM-NEXT:    lui a1, 524272
+; ILP32E-FPELIM-NEXT:    li a2, 1
+; ILP32E-FPELIM-NEXT:    addi a0, sp, 24
 ; ILP32E-FPELIM-NEXT:    sw zero, 0(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 4(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-NEXT:    li a2, 1
-; ILP32E-FPELIM-NEXT:    addi a0, sp, 24
+; ILP32E-FPELIM-NEXT:    sw a1, 12(sp)
 ; ILP32E-FPELIM-NEXT:    mv a1, sp
 ; ILP32E-FPELIM-NEXT:    sw a2, 24(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 28(sp)
@@ -1537,13 +1537,13 @@ define i32 @caller_large_scalars() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 48
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    lui a0, 524272
+; ILP32E-WITHFP-NEXT:    lui a1, 524272
+; ILP32E-WITHFP-NEXT:    li a2, 1
+; ILP32E-WITHFP-NEXT:    addi a0, sp, 24
 ; ILP32E-WITHFP-NEXT:    sw zero, 0(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 4(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-NEXT:    li a2, 1
-; ILP32E-WITHFP-NEXT:    addi a0, sp, 24
+; ILP32E-WITHFP-NEXT:    sw a1, 12(sp)
 ; ILP32E-WITHFP-NEXT:    mv a1, sp
 ; ILP32E-WITHFP-NEXT:    sw a2, 24(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 28(sp)
@@ -1571,13 +1571,13 @@ define i32 @caller_large_scalars() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi s0, sp, 48
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 524272
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a1, 524272
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, sp, 24
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 0(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 4(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, sp, 24
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 12(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    mv a1, sp
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 24(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 28(sp)
@@ -1601,13 +1601,13 @@ define i32 @caller_large_scalars() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 48
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 524272
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a1, 524272
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, sp, 24
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 0(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 4(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 8(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, sp, 24
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 12(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    mv a1, sp
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 24(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 28(sp)
@@ -1630,23 +1630,23 @@ define i32 @caller_large_scalars() {
 define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i128 %h, i32 %i, fp128 %j) {
 ; ILP32E-FPELIM-LABEL: callee_large_scalars_exhausted_regs:
 ; ILP32E-FPELIM:       # %bb.0:
-; ILP32E-FPELIM-NEXT:    lw a0, 4(sp)
-; ILP32E-FPELIM-NEXT:    lw a1, 12(sp)
+; ILP32E-FPELIM-NEXT:    lw a0, 12(sp)
+; ILP32E-FPELIM-NEXT:    lw a1, 4(sp)
 ; ILP32E-FPELIM-NEXT:    lw a2, 0(a0)
 ; ILP32E-FPELIM-NEXT:    lw a3, 4(a0)
-; ILP32E-FPELIM-NEXT:    lw a4, 12(a1)
-; ILP32E-FPELIM-NEXT:    lw a5, 12(a0)
-; ILP32E-FPELIM-NEXT:    lw a6, 0(a1)
-; ILP32E-FPELIM-NEXT:    lw a7, 4(a1)
-; ILP32E-FPELIM-NEXT:    lw a1, 8(a1)
-; ILP32E-FPELIM-NEXT:    lw a0, 8(a0)
-; ILP32E-FPELIM-NEXT:    xor a4, a5, a4
-; ILP32E-FPELIM-NEXT:    xor a3, a3, a7
-; ILP32E-FPELIM-NEXT:    or a3, a3, a4
-; ILP32E-FPELIM-NEXT:    xor a0, a0, a1
-; ILP32E-FPELIM-NEXT:    xor a1, a2, a6
+; ILP32E-FPELIM-NEXT:    lw a4, 8(a0)
+; ILP32E-FPELIM-NEXT:    lw a0, 12(a0)
+; ILP32E-FPELIM-NEXT:    lw a5, 12(a1)
+; ILP32E-FPELIM-NEXT:    lw a6, 4(a1)
+; ILP32E-FPELIM-NEXT:    lw a7, 8(a1)
+; ILP32E-FPELIM-NEXT:    lw a1, 0(a1)
+; ILP32E-FPELIM-NEXT:    xor a0, a5, a0
+; ILP32E-FPELIM-NEXT:    xor a3, a6, a3
+; ILP32E-FPELIM-NEXT:    xor a4, a7, a4
+; ILP32E-FPELIM-NEXT:    xor a1, a1, a2
+; ILP32E-FPELIM-NEXT:    or a0, a3, a0
+; ILP32E-FPELIM-NEXT:    or a1, a1, a4
 ; ILP32E-FPELIM-NEXT:    or a0, a1, a0
-; ILP32E-FPELIM-NEXT:    or a0, a0, a3
 ; ILP32E-FPELIM-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-NEXT:    ret
 ;
@@ -1660,23 +1660,23 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; ILP32E-WITHFP-NEXT:    .cfi_offset s0, -8
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
-; ILP32E-WITHFP-NEXT:    lw a0, 4(s0)
-; ILP32E-WITHFP-NEXT:    lw a1, 12(s0)
+; ILP32E-WITHFP-NEXT:    lw a0, 12(s0)
+; ILP32E-WITHFP-NEXT:    lw a1, 4(s0)
 ; ILP32E-WITHFP-NEXT:    lw a2, 0(a0)
 ; ILP32E-WITHFP-NEXT:    lw a3, 4(a0)
-; ILP32E-WITHFP-NEXT:    lw a4, 12(a1)
-; ILP32E-WITHFP-NEXT:    lw a5, 12(a0)
-; ILP32E-WITHFP-NEXT:    lw a6, 0(a1)
-; ILP32E-WITHFP-NEXT:    lw a7, 4(a1)
-; ILP32E-WITHFP-NEXT:    lw a1, 8(a1)
-; ILP32E-WITHFP-NEXT:    lw a0, 8(a0)
-; ILP32E-WITHFP-NEXT:    xor a4, a5, a4
-; ILP32E-WITHFP-NEXT:    xor a3, a3, a7
-; ILP32E-WITHFP-NEXT:    or a3, a3, a4
-; ILP32E-WITHFP-NEXT:    xor a0, a0, a1
-; ILP32E-WITHFP-NEXT:    xor a1, a2, a6
+; ILP32E-WITHFP-NEXT:    lw a4, 8(a0)
+; ILP32E-WITHFP-NEXT:    lw a0, 12(a0)
+; ILP32E-WITHFP-NEXT:    lw a5, 12(a1)
+; ILP32E-WITHFP-NEXT:    lw a6, 4(a1)
+; ILP32E-WITHFP-NEXT:    lw a7, 8(a1)
+; ILP32E-WITHFP-NEXT:    lw a1, 0(a1)
+; ILP32E-WITHFP-NEXT:    xor a0, a5, a0
+; ILP32E-WITHFP-NEXT:    xor a3, a6, a3
+; ILP32E-WITHFP-NEXT:    xor a4, a7, a4
+; ILP32E-WITHFP-NEXT:    xor a1, a1, a2
+; ILP32E-WITHFP-NEXT:    or a0, a3, a0
+; ILP32E-WITHFP-NEXT:    or a1, a1, a4
 ; ILP32E-WITHFP-NEXT:    or a0, a1, a0
-; ILP32E-WITHFP-NEXT:    or a0, a0, a3
 ; ILP32E-WITHFP-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-NEXT:    lw ra, 4(sp) # 4-byte Folded Reload
@@ -1689,23 +1689,23 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ;
 ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars_exhausted_regs:
 ; ILP32E-FPELIM-SAVE-RESTORE:       # %bb.0:
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 4(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 12(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 12(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 4(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a2, 0(a0)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 12(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 0(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 4(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 8(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 8(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a5, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a3, a7
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a3, a3, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a0, a1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a2, a6
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 8(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 12(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 12(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 4(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 8(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 0(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a5, a0
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a6, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a7, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a1, a2
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a3, a0
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a1, a1, a4
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a1, a0
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a3
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    ret
 ;
@@ -1717,23 +1717,23 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_offset s0, -8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 4(s0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 12(s0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 12(s0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 4(s0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a2, 0(a0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 12(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 0(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 4(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 8(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 8(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a5, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a3, a7
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a3, a3, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a0, a1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a2, a6
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 8(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 12(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 12(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 4(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 8(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 0(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a5, a0
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a6, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a7, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a1, a2
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a3, a0
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a1, a1, a4
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a1, a0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a3
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    tail __riscv_restore_1
@@ -1755,27 +1755,27 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    addi a0, sp, 16
-; ILP32E-FPELIM-NEXT:    li a1, 9
-; ILP32E-FPELIM-NEXT:    addi a2, sp, 40
-; ILP32E-FPELIM-NEXT:    li a3, 7
-; ILP32E-FPELIM-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-NEXT:    sw a1, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-NEXT:    lui a0, 524272
-; ILP32E-FPELIM-NEXT:    sw zero, 16(sp)
-; ILP32E-FPELIM-NEXT:    sw zero, 20(sp)
-; ILP32E-FPELIM-NEXT:    sw zero, 24(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 28(sp)
-; ILP32E-FPELIM-NEXT:    li a6, 8
+; ILP32E-FPELIM-NEXT:    addi a4, sp, 16
+; ILP32E-FPELIM-NEXT:    li a5, 9
+; ILP32E-FPELIM-NEXT:    addi a6, sp, 40
+; ILP32E-FPELIM-NEXT:    li a7, 7
+; ILP32E-FPELIM-NEXT:    lui t0, 524272
+; ILP32E-FPELIM-NEXT:    li t1, 8
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a1, 2
 ; ILP32E-FPELIM-NEXT:    li a2, 3
 ; ILP32E-FPELIM-NEXT:    li a3, 4
+; ILP32E-FPELIM-NEXT:    sw a7, 0(sp)
+; ILP32E-FPELIM-NEXT:    sw a6, 4(sp)
+; ILP32E-FPELIM-NEXT:    sw a5, 8(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-NEXT:    li a4, 5
+; ILP32E-FPELIM-NEXT:    sw zero, 16(sp)
+; ILP32E-FPELIM-NEXT:    sw zero, 20(sp)
+; ILP32E-FPELIM-NEXT:    sw zero, 24(sp)
+; ILP32E-FPELIM-NEXT:    sw t0, 28(sp)
 ; ILP32E-FPELIM-NEXT:    li a5, 6
-; ILP32E-FPELIM-NEXT:    sw a6, 40(sp)
+; ILP32E-FPELIM-NEXT:    sw t1, 40(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 44(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 48(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 52(sp)
@@ -1801,27 +1801,27 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    addi a0, sp, 16
-; ILP32E-WITHFP-NEXT:    li a1, 9
-; ILP32E-WITHFP-NEXT:    addi a2, sp, 40
-; ILP32E-WITHFP-NEXT:    li a3, 7
-; ILP32E-WITHFP-NEXT:    sw a3, 0(sp)
-; ILP32E-WITHFP-NEXT:    sw a2, 4(sp)
-; ILP32E-WITHFP-NEXT:    sw a1, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-NEXT:    lui a0, 524272
-; ILP32E-WITHFP-NEXT:    sw zero, 16(sp)
-; ILP32E-WITHFP-NEXT:    sw zero, 20(sp)
-; ILP32E-WITHFP-NEXT:    sw zero, 24(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 28(sp)
-; ILP32E-WITHFP-NEXT:    li a6, 8
+; ILP32E-WITHFP-NEXT:    addi a4, sp, 16
+; ILP32E-WITHFP-NEXT:    li a5, 9
+; ILP32E-WITHFP-NEXT:    addi a6, sp, 40
+; ILP32E-WITHFP-NEXT:    li a7, 7
+; ILP32E-WITHFP-NEXT:    lui t0, 524272
+; ILP32E-WITHFP-NEXT:    li t1, 8
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a1, 2
 ; ILP32E-WITHFP-NEXT:    li a2, 3
 ; ILP32E-WITHFP-NEXT:    li a3, 4
+; ILP32E-WITHFP-NEXT:    sw a7, 0(sp)
+; ILP32E-WITHFP-NEXT:    sw a6, 4(sp)
+; ILP32E-WITHFP-NEXT:    sw a5, 8(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 12(sp)
 ; ILP32E-WITHFP-NEXT:    li a4, 5
+; ILP32E-WITHFP-NEXT:    sw zero, 16(sp)
+; ILP32E-WITHFP-NEXT:    sw zero, 20(sp)
+; ILP32E-WITHFP-NEXT:    sw zero, 24(sp)
+; ILP32E-WITHFP-NEXT:    sw t0, 28(sp)
 ; ILP32E-WITHFP-NEXT:    li a5, 6
-; ILP32E-WITHFP-NEXT:    sw a6, 40(sp)
+; ILP32E-WITHFP-NEXT:    sw t1, 40(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 44(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 48(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 52(sp)
@@ -1847,27 +1847,27 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, sp, 16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 9
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a2, sp, 40
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 7
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 524272
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 28(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a6, 8
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a4, sp, 16
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 9
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, sp, 40
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a7, 7
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t0, 524272
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t1, 8
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 2
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 3
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 0(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 4(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a5, 8(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 5
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 28(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 6
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 40(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 40(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 44(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 48(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 52(sp)
@@ -1889,27 +1889,27 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, sp, 16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 9
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a2, sp, 40
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 7
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 4(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 524272
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 28(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a6, 8
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a4, sp, 16
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 9
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, sp, 40
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a7, 7
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t0, 524272
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t1, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 2
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 3
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 0(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 4(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a5, 8(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 5
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 28(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 6
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 40(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 40(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 44(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 48(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 52(sp)
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
index b0d60a7aaa235f..dabd2a7ce9a73d 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
@@ -55,11 +55,11 @@ define i32 @caller_float_in_fpr_exhausted_gprs() nounwind {
 ; RV32-ILP32FD-NEXT:    addi sp, sp, -16
 ; RV32-ILP32FD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32FD-NEXT:    li a1, 5
-; RV32-ILP32FD-NEXT:    lui a0, 265216
-; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a0
+; RV32-ILP32FD-NEXT:    lui a3, 265216
 ; RV32-ILP32FD-NEXT:    li a0, 1
 ; RV32-ILP32FD-NEXT:    li a2, 2
 ; RV32-ILP32FD-NEXT:    li a4, 3
+; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a3
 ; RV32-ILP32FD-NEXT:    li a6, 4
 ; RV32-ILP32FD-NEXT:    sw a1, 0(sp)
 ; RV32-ILP32FD-NEXT:    li a1, 0
@@ -96,21 +96,21 @@ define i32 @caller_float_in_gpr_exhausted_fprs() nounwind {
 ; RV32-ILP32FD-NEXT:    addi sp, sp, -16
 ; RV32-ILP32FD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32FD-NEXT:    lui a0, 260096
+; RV32-ILP32FD-NEXT:    lui a1, 262144
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a0
-; RV32-ILP32FD-NEXT:    lui a0, 262144
-; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a0
 ; RV32-ILP32FD-NEXT:    lui a0, 263168
+; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a1
+; RV32-ILP32FD-NEXT:    lui a1, 264192
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa2, a0
-; RV32-ILP32FD-NEXT:    lui a0, 264192
-; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a0
 ; RV32-ILP32FD-NEXT:    lui a0, 264704
+; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a1
+; RV32-ILP32FD-NEXT:    lui a1, 265216
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa4, a0
-; RV32-ILP32FD-NEXT:    lui a0, 265216
-; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a0
 ; RV32-ILP32FD-NEXT:    lui a0, 265728
+; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a1
+; RV32-ILP32FD-NEXT:    lui a1, 266240
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa6, a0
-; RV32-ILP32FD-NEXT:    lui a0, 266240
-; RV32-ILP32FD-NEXT:    fmv.w.x fa7, a0
+; RV32-ILP32FD-NEXT:    fmv.w.x fa7, a1
 ; RV32-ILP32FD-NEXT:    lui a0, 266496
 ; RV32-ILP32FD-NEXT:    call callee_float_in_gpr_exhausted_fprs
 ; RV32-ILP32FD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -143,24 +143,24 @@ define i32 @caller_float_on_stack_exhausted_gprs_fprs() nounwind {
 ; RV32-ILP32FD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32FD-NEXT:    lui a1, 267520
 ; RV32-ILP32FD-NEXT:    lui a0, 262144
+; RV32-ILP32FD-NEXT:    lui a2, 264192
+; RV32-ILP32FD-NEXT:    lui a3, 265216
+; RV32-ILP32FD-NEXT:    lui a4, 266240
+; RV32-ILP32FD-NEXT:    lui a5, 266496
+; RV32-ILP32FD-NEXT:    lui a6, 266752
+; RV32-ILP32FD-NEXT:    lui a7, 267008
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a0
-; RV32-ILP32FD-NEXT:    lui a0, 264192
-; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a0
-; RV32-ILP32FD-NEXT:    lui a0, 265216
-; RV32-ILP32FD-NEXT:    fmv.w.x fa2, a0
-; RV32-ILP32FD-NEXT:    lui a0, 266240
-; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a0
-; RV32-ILP32FD-NEXT:    lui a0, 266496
-; RV32-ILP32FD-NEXT:    fmv.w.x fa4, a0
-; RV32-ILP32FD-NEXT:    lui a0, 266752
-; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a0
-; RV32-ILP32FD-NEXT:    lui a0, 267008
-; RV32-ILP32FD-NEXT:    fmv.w.x fa6, a0
-; RV32-ILP32FD-NEXT:    lui a0, 267264
-; RV32-ILP32FD-NEXT:    fmv.w.x fa7, a0
+; RV32-ILP32FD-NEXT:    lui t0, 267264
+; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a2
 ; RV32-ILP32FD-NEXT:    li a0, 1
+; RV32-ILP32FD-NEXT:    fmv.w.x fa2, a3
 ; RV32-ILP32FD-NEXT:    li a2, 3
+; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a4
 ; RV32-ILP32FD-NEXT:    li a4, 5
+; RV32-ILP32FD-NEXT:    fmv.w.x fa4, a5
+; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a6
+; RV32-ILP32FD-NEXT:    fmv.w.x fa6, a7
+; RV32-ILP32FD-NEXT:    fmv.w.x fa7, t0
 ; RV32-ILP32FD-NEXT:    li a6, 7
 ; RV32-ILP32FD-NEXT:    sw a1, 0(sp)
 ; RV32-ILP32FD-NEXT:    li a1, 0
diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
index cbd2cef981d71f..746b71a08a30ba 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
@@ -52,15 +52,15 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i128 %d, i32 %e, i32 %f,
 ; RV64I-NEXT:    ld t1, 0(sp)
 ; RV64I-NEXT:    andi a0, a0, 255
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    xor a3, a3, a7
 ; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    xor a1, a4, t1
-; RV64I-NEXT:    xor a2, a3, a7
-; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    add a0, a0, a5
+; RV64I-NEXT:    xor a1, a4, t1
 ; RV64I-NEXT:    add a0, a0, a6
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    add a0, a0, t0
 ; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -106,21 +106,21 @@ define i32 @caller_many_scalars() nounwind {
 define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 ; RV64I-LABEL: callee_large_scalars:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a2, 0(a0)
-; RV64I-NEXT:    ld a3, 8(a0)
-; RV64I-NEXT:    ld a4, 24(a1)
+; RV64I-NEXT:    ld a2, 0(a1)
+; RV64I-NEXT:    ld a3, 8(a1)
+; RV64I-NEXT:    ld a4, 16(a1)
+; RV64I-NEXT:    ld a1, 24(a1)
 ; RV64I-NEXT:    ld a5, 24(a0)
-; RV64I-NEXT:    ld a6, 0(a1)
-; RV64I-NEXT:    ld a7, 8(a1)
-; RV64I-NEXT:    ld a1, 16(a1)
-; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    xor a4, a5, a4
-; RV64I-NEXT:    xor a3, a3, a7
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    xor a0, a0, a1
-; RV64I-NEXT:    xor a1, a2, a6
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    ld a6, 8(a0)
+; RV64I-NEXT:    ld a7, 16(a0)
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    xor a1, a5, a1
+; RV64I-NEXT:    xor a3, a6, a3
+; RV64I-NEXT:    xor a4, a7, a4
+; RV64I-NEXT:    xor a0, a0, a2
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %a, %b
@@ -133,15 +133,15 @@ define i64 @caller_large_scalars() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -80
 ; RV64I-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    li a0, 2
-; RV64I-NEXT:    sd a0, 0(sp)
+; RV64I-NEXT:    li a2, 2
+; RV64I-NEXT:    li a3, 1
+; RV64I-NEXT:    addi a0, sp, 32
+; RV64I-NEXT:    mv a1, sp
+; RV64I-NEXT:    sd a2, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    li a2, 1
-; RV64I-NEXT:    addi a0, sp, 32
-; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    sd a2, 32(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
@@ -163,18 +163,18 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 ; RV64I-NEXT:    ld a0, 8(sp)
 ; RV64I-NEXT:    ld a1, 0(a7)
 ; RV64I-NEXT:    ld a2, 8(a7)
-; RV64I-NEXT:    ld a3, 24(a0)
+; RV64I-NEXT:    ld a3, 16(a7)
 ; RV64I-NEXT:    ld a4, 24(a7)
-; RV64I-NEXT:    ld a5, 0(a0)
+; RV64I-NEXT:    ld a5, 24(a0)
 ; RV64I-NEXT:    ld a6, 8(a0)
-; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    ld a7, 16(a7)
-; RV64I-NEXT:    xor a3, a4, a3
+; RV64I-NEXT:    ld a7, 16(a0)
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    xor a4, a4, a5
 ; RV64I-NEXT:    xor a2, a2, a6
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    xor a0, a7, a0
-; RV64I-NEXT:    xor a1, a1, a5
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    xor a3, a3, a7
+; RV64I-NEXT:    xor a0, a1, a0
+; RV64I-NEXT:    or a2, a2, a4
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
@@ -188,16 +188,10 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -96
 ; RV64I-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a0, sp, 16
-; RV64I-NEXT:    li a1, 9
-; RV64I-NEXT:    sd a1, 0(sp)
-; RV64I-NEXT:    sd a0, 8(sp)
-; RV64I-NEXT:    li a0, 10
-; RV64I-NEXT:    sd a0, 16(sp)
-; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    sd zero, 32(sp)
-; RV64I-NEXT:    sd zero, 40(sp)
-; RV64I-NEXT:    li t0, 8
+; RV64I-NEXT:    addi a7, sp, 16
+; RV64I-NEXT:    li t0, 9
+; RV64I-NEXT:    li t1, 10
+; RV64I-NEXT:    li t2, 8
 ; RV64I-NEXT:    li a0, 1
 ; RV64I-NEXT:    li a1, 2
 ; RV64I-NEXT:    li a2, 3
@@ -205,8 +199,14 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    li a5, 6
 ; RV64I-NEXT:    li a6, 7
+; RV64I-NEXT:    sd t0, 0(sp)
+; RV64I-NEXT:    sd a7, 8(sp)
 ; RV64I-NEXT:    addi a7, sp, 48
-; RV64I-NEXT:    sd t0, 48(sp)
+; RV64I-NEXT:    sd t1, 16(sp)
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd t2, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
 ; RV64I-NEXT:    sd zero, 64(sp)
 ; RV64I-NEXT:    sd zero, 72(sp)
@@ -356,24 +356,24 @@ define void @caller_aligned_stack() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    li a0, 12
-; RV64I-NEXT:    li a1, 11
-; RV64I-NEXT:    sd a1, 40(sp)
-; RV64I-NEXT:    sd a0, 48(sp)
-; RV64I-NEXT:    li a6, 10
-; RV64I-NEXT:    li t0, 9
-; RV64I-NEXT:    li t1, 8
+; RV64I-NEXT:    li a6, 12
+; RV64I-NEXT:    li a7, 11
+; RV64I-NEXT:    li t0, 10
+; RV64I-NEXT:    li t1, 9
+; RV64I-NEXT:    li t2, 8
 ; RV64I-NEXT:    li a0, 1
 ; RV64I-NEXT:    li a1, 2
 ; RV64I-NEXT:    li a2, 3
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    li a5, 6
+; RV64I-NEXT:    sd a7, 40(sp)
+; RV64I-NEXT:    sd a6, 48(sp)
 ; RV64I-NEXT:    li a7, 7
-; RV64I-NEXT:    sd t1, 0(sp)
-; RV64I-NEXT:    sd t0, 16(sp)
+; RV64I-NEXT:    sd t2, 0(sp)
+; RV64I-NEXT:    sd t1, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    sd a6, 32(sp)
+; RV64I-NEXT:    sd t0, 32(sp)
 ; RV64I-NEXT:    li a6, 0
 ; RV64I-NEXT:    call callee_aligned_stack
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
@@ -480,15 +480,15 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a1, 1
 ; RV64I-NEXT:    li a2, 2
+; RV64I-NEXT:    li a3, 3
+; RV64I-NEXT:    li a4, 4
 ; RV64I-NEXT:    sw a1, 0(a0)
 ; RV64I-NEXT:    sw zero, 4(a0)
 ; RV64I-NEXT:    sw a2, 8(a0)
 ; RV64I-NEXT:    sw zero, 12(a0)
-; RV64I-NEXT:    li a1, 3
-; RV64I-NEXT:    li a2, 4
-; RV64I-NEXT:    sw a1, 16(a0)
+; RV64I-NEXT:    sw a3, 16(a0)
 ; RV64I-NEXT:    sw zero, 20(a0)
-; RV64I-NEXT:    sw a2, 24(a0)
+; RV64I-NEXT:    sw a4, 24(a0)
 ; RV64I-NEXT:    sw zero, 28(a0)
 ; RV64I-NEXT:    ret
   store i64 1, ptr %agg.result, align 4
diff --git a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
index b26bd7b889807a..6608874286e346 100644
--- a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
+++ b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
@@ -87,8 +87,8 @@ define signext i32 @test3(i32 signext %v, i32 signext %w, i32 signext %x, i32 si
 ; NOCMOV-NEXT:    seqz a4, a4
 ; NOCMOV-NEXT:    addi a4, a4, -1
 ; NOCMOV-NEXT:    and a1, a1, a4
-; NOCMOV-NEXT:    xor a0, a0, a1
 ; NOCMOV-NEXT:    and a3, a3, a4
+; NOCMOV-NEXT:    xor a0, a0, a1
 ; NOCMOV-NEXT:    xor a2, a2, a3
 ; NOCMOV-NEXT:    addw a0, a0, a2
 ; NOCMOV-NEXT:    ret
@@ -96,16 +96,16 @@ define signext i32 @test3(i32 signext %v, i32 signext %w, i32 signext %x, i32 si
 ; CMOV-LABEL: test3:
 ; CMOV:       # %bb.0:
 ; CMOV-NEXT:    xor a1, a1, a0
+; CMOV-NEXT:    xor a3, a3, a2
 ; CMOV-NEXT:    bnez a4, .LBB2_2
 ; CMOV-NEXT:  # %bb.1:
 ; CMOV-NEXT:    mv a1, a0
 ; CMOV-NEXT:  .LBB2_2:
-; CMOV-NEXT:    xor a0, a2, a3
 ; CMOV-NEXT:    bnez a4, .LBB2_4
 ; CMOV-NEXT:  # %bb.3:
-; CMOV-NEXT:    mv a0, a2
+; CMOV-NEXT:    mv a3, a2
 ; CMOV-NEXT:  .LBB2_4:
-; CMOV-NEXT:    addw a0, a0, a1
+; CMOV-NEXT:    addw a0, a1, a3
 ; CMOV-NEXT:    ret
 ;
 ; SHORT_FORWARD-LABEL: test3:
diff --git a/llvm/test/CodeGen/RISCV/compress.ll b/llvm/test/CodeGen/RISCV/compress.ll
index 8fb520fac41ee0..c8803773d76306 100644
--- a/llvm/test/CodeGen/RISCV/compress.ll
+++ b/llvm/test/CodeGen/RISCV/compress.ll
@@ -32,9 +32,9 @@
 define i32 @simple_arith(i32 %a, i32 %b) #0 {
 ; RV32IC-LABEL: <simple_arith>:
 ; RV32IC:         addi a2, a0, 0x1
+; RV32IC-NEXT:    c.srai a1, 0x9
 ; RV32IC-NEXT:    c.andi a2, 0xb
 ; RV32IC-NEXT:    c.slli a2, 0x7
-; RV32IC-NEXT:    c.srai a1, 0x9
 ; RV32IC-NEXT:    sub a0, a1, a0
 ; RV32IC-NEXT:    c.add a0, a2
 ; RV32IC-NEXT:    c.jr ra
diff --git a/llvm/test/CodeGen/RISCV/condbinops.ll b/llvm/test/CodeGen/RISCV/condbinops.ll
index 1a661fddacfa05..dc81c13bfb6a35 100644
--- a/llvm/test/CodeGen/RISCV/condbinops.ll
+++ b/llvm/test/CodeGen/RISCV/condbinops.ll
@@ -453,19 +453,19 @@ define i64 @shl64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-LABEL: shl64:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    srli a3, a0, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
-; RV32ZICOND-NEXT:    sll a3, a0, a2
+; RV32ZICOND-NEXT:    sll a0, a0, a2
 ; RV32ZICOND-NEXT:    addi a4, a2, -32
-; RV32ZICOND-NEXT:    slti a4, a4, 0
-; RV32ZICOND-NEXT:    czero.nez a5, a3, a4
 ; RV32ZICOND-NEXT:    sll a1, a1, a2
 ; RV32ZICOND-NEXT:    not a2, a2
-; RV32ZICOND-NEXT:    srli a0, a0, 1
-; RV32ZICOND-NEXT:    srl a0, a0, a2
-; RV32ZICOND-NEXT:    or a0, a1, a0
-; RV32ZICOND-NEXT:    czero.eqz a1, a0, a4
-; RV32ZICOND-NEXT:    or a1, a1, a5
-; RV32ZICOND-NEXT:    czero.eqz a0, a3, a4
+; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    srl a2, a3, a2
+; RV32ZICOND-NEXT:    czero.nez a3, a0, a4
+; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
+; RV32ZICOND-NEXT:    or a1, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: shl64:
@@ -527,22 +527,22 @@ define i64 @ashr64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-LABEL: ashr64:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    slli a3, a1, 1
+; RV32ZICOND-NEXT:    srai a5, a1, 31
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
-; RV32ZICOND-NEXT:    sra a3, a1, a2
+; RV32ZICOND-NEXT:    sra a1, a1, a2
 ; RV32ZICOND-NEXT:    addi a4, a2, -32
-; RV32ZICOND-NEXT:    slti a4, a4, 0
-; RV32ZICOND-NEXT:    czero.nez a5, a3, a4
 ; RV32ZICOND-NEXT:    srl a0, a0, a2
 ; RV32ZICOND-NEXT:    not a2, a2
-; RV32ZICOND-NEXT:    slli a6, a1, 1
-; RV32ZICOND-NEXT:    sll a2, a6, a2
+; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    sll a2, a3, a2
+; RV32ZICOND-NEXT:    czero.nez a3, a1, a4
 ; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
+; RV32ZICOND-NEXT:    czero.nez a2, a5, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a5
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a4
-; RV32ZICOND-NEXT:    srai a1, a1, 31
-; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a2, a1
+; RV32ZICOND-NEXT:    or a0, a0, a3
+; RV32ZICOND-NEXT:    or a1, a1, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: ashr64:
@@ -604,19 +604,19 @@ define i64 @lshr64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-LABEL: lshr64:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    slli a3, a1, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
-; RV32ZICOND-NEXT:    srl a3, a1, a2
+; RV32ZICOND-NEXT:    srl a1, a1, a2
 ; RV32ZICOND-NEXT:    addi a4, a2, -32
-; RV32ZICOND-NEXT:    slti a4, a4, 0
-; RV32ZICOND-NEXT:    czero.nez a5, a3, a4
 ; RV32ZICOND-NEXT:    srl a0, a0, a2
 ; RV32ZICOND-NEXT:    not a2, a2
-; RV32ZICOND-NEXT:    slli a1, a1, 1
-; RV32ZICOND-NEXT:    sll a1, a1, a2
-; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    sll a2, a3, a2
+; RV32ZICOND-NEXT:    czero.nez a3, a1, a4
+; RV32ZICOND-NEXT:    or a0, a0, a2
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a5
-; RV32ZICOND-NEXT:    czero.eqz a1, a3, a4
+; RV32ZICOND-NEXT:    or a0, a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: lshr64:
@@ -636,10 +636,10 @@ define i64 @sub64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    sltu a5, a0, a2
 ; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a5
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
@@ -669,10 +669,10 @@ define i64 @sub64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
-; RV32ZICOND-NEXT:    sltu a5, a0, a2
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
+; RV32ZICOND-NEXT:    sltu a4, a0, a2
 ; RV32ZICOND-NEXT:    sub a1, a1, a3
-; RV32ZICOND-NEXT:    sub a1, a1, a5
+; RV32ZICOND-NEXT:    sub a1, a1, a4
 ; RV32ZICOND-NEXT:    sub a0, a0, a2
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -728,8 +728,8 @@ define i64 @and64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-NEXT:    and a3, a1, a3
 ; RV32ZICOND-NEXT:    and a2, a0, a2
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a2, a0
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
+; RV32ZICOND-NEXT:    or a0, a2, a0
 ; RV32ZICOND-NEXT:    or a1, a3, a1
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -752,8 +752,8 @@ define i64 @add64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a3, a4, a3
-; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    and a2, a4, a2
+; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
 ; RV32I-NEXT:    add a1, a1, a0
@@ -786,8 +786,8 @@ define i64 @add64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
-; RV32ZICOND-NEXT:    add a1, a1, a3
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
+; RV32ZICOND-NEXT:    add a1, a1, a3
 ; RV32ZICOND-NEXT:    add a2, a0, a2
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
 ; RV32ZICOND-NEXT:    add a1, a1, a0
@@ -812,8 +812,8 @@ define i64 @or64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -843,9 +843,9 @@ define i64 @or64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
 ; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or64:
@@ -865,8 +865,8 @@ define i64 @xor64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -896,9 +896,9 @@ define i64 @xor64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
 ; RV32ZICOND-NEXT:    xor a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a4
-; RV32ZICOND-NEXT:    xor a1, a1, a2
+; RV32ZICOND-NEXT:    xor a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor64:
diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll
index fbc9bc39942dd3..6c2ba493ffcd57 100644
--- a/llvm/test/CodeGen/RISCV/condops.ll
+++ b/llvm/test/CodeGen/RISCV/condops.ll
@@ -208,8 +208,8 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    and a4, a0, a4
-; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    sltu a1, a0, a1
 ; RV32I-NEXT:    add a1, a2, a1
@@ -225,8 +225,8 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: add1:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    add a0, a1, a0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a1
 ; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
@@ -247,8 +247,8 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: add1:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a4, a4, a0
-; RV32ZICOND-NEXT:    add a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a3, a0
+; RV32ZICOND-NEXT:    add a2, a2, a4
 ; RV32ZICOND-NEXT:    add a0, a1, a0
 ; RV32ZICOND-NEXT:    sltu a1, a0, a1
 ; RV32ZICOND-NEXT:    add a1, a2, a1
@@ -269,11 +269,11 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    add a2, a4, a2
 ; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    add a1, a4, a2
 ; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    sltu a1, a0, a3
-; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: add2:
@@ -286,11 +286,11 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: add2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    add a2, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    add a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    add a0, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a3
-; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    sltu a2, a0, a3
+; RV32XVENTANACONDOPS-NEXT:    add a1, a1, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: add2:
@@ -308,11 +308,11 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: add2:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
-; RV32ZICOND-NEXT:    add a2, a4, a2
 ; RV32ZICOND-NEXT:    czero.eqz a0, a1, a0
+; RV32ZICOND-NEXT:    add a1, a4, a2
 ; RV32ZICOND-NEXT:    add a0, a3, a0
-; RV32ZICOND-NEXT:    sltu a1, a0, a3
-; RV32ZICOND-NEXT:    add a1, a2, a1
+; RV32ZICOND-NEXT:    sltu a2, a0, a3
+; RV32ZICOND-NEXT:    add a1, a1, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: add2:
@@ -330,8 +330,8 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a4, a0, a4
-; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    sltu a1, a0, a1
 ; RV32I-NEXT:    add a1, a2, a1
@@ -347,8 +347,8 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: add3:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    add a0, a1, a0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a1
 ; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
@@ -369,8 +369,8 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: add3:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
-; RV32ZICOND-NEXT:    add a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.nez a0, a3, a0
+; RV32ZICOND-NEXT:    add a2, a2, a4
 ; RV32ZICOND-NEXT:    add a0, a1, a0
 ; RV32ZICOND-NEXT:    sltu a1, a0, a1
 ; RV32ZICOND-NEXT:    add a1, a2, a1
@@ -391,11 +391,11 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    add a2, a4, a2
 ; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    add a1, a4, a2
 ; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    sltu a1, a0, a3
-; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: add4:
@@ -408,11 +408,11 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: add4:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    add a2, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    add a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    add a0, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a3
-; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    sltu a2, a0, a3
+; RV32XVENTANACONDOPS-NEXT:    add a1, a1, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: add4:
@@ -430,11 +430,11 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: add4:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
-; RV32ZICOND-NEXT:    add a2, a4, a2
 ; RV32ZICOND-NEXT:    czero.nez a0, a1, a0
+; RV32ZICOND-NEXT:    add a1, a4, a2
 ; RV32ZICOND-NEXT:    add a0, a3, a0
-; RV32ZICOND-NEXT:    sltu a1, a0, a3
-; RV32ZICOND-NEXT:    add a1, a2, a1
+; RV32ZICOND-NEXT:    sltu a2, a0, a3
+; RV32ZICOND-NEXT:    add a1, a1, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: add4:
@@ -452,10 +452,10 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    sltu a5, a1, a3
 ; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    sltu a4, a1, a3
 ; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    sub a2, a2, a5
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a0, a1, a3
 ; RV32I-NEXT:    mv a1, a2
 ; RV32I-NEXT:    ret
@@ -470,10 +470,10 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: sub1:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    sltu a5, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a4, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a5
+; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    sub a0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    mv a1, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
@@ -493,10 +493,10 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: sub1:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
-; RV32ZICOND-NEXT:    sltu a5, a1, a3
 ; RV32ZICOND-NEXT:    czero.eqz a0, a4, a0
+; RV32ZICOND-NEXT:    sltu a4, a1, a3
 ; RV32ZICOND-NEXT:    sub a2, a2, a0
-; RV32ZICOND-NEXT:    sub a2, a2, a5
+; RV32ZICOND-NEXT:    sub a2, a2, a4
 ; RV32ZICOND-NEXT:    sub a0, a1, a3
 ; RV32ZICOND-NEXT:    mv a1, a2
 ; RV32ZICOND-NEXT:    ret
@@ -516,10 +516,10 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    sltu a5, a1, a3
 ; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    sltu a4, a1, a3
 ; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    sub a2, a2, a5
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a0, a1, a3
 ; RV32I-NEXT:    mv a1, a2
 ; RV32I-NEXT:    ret
@@ -534,10 +534,10 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: sub2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    sltu a5, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a4, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a5
+; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    sub a0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    mv a1, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
@@ -557,10 +557,10 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: sub2:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
-; RV32ZICOND-NEXT:    sltu a5, a1, a3
 ; RV32ZICOND-NEXT:    czero.nez a0, a4, a0
+; RV32ZICOND-NEXT:    sltu a4, a1, a3
 ; RV32ZICOND-NEXT:    sub a2, a2, a0
-; RV32ZICOND-NEXT:    sub a2, a2, a5
+; RV32ZICOND-NEXT:    sub a2, a2, a4
 ; RV32ZICOND-NEXT:    sub a0, a1, a3
 ; RV32ZICOND-NEXT:    mv a1, a2
 ; RV32ZICOND-NEXT:    ret
@@ -578,11 +578,11 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: or1:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    neg a5, a0
-; RV32I-NEXT:    and a0, a5, a3
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    and a1, a5, a4
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    and a4, a0, a4
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    or a1, a2, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: or1:
@@ -595,10 +595,9 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: or1:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: or1:
@@ -616,10 +615,9 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: or1:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
-; RV32ZICOND-NEXT:    or a3, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
-; RV32ZICOND-NEXT:    or a1, a2, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a4, a4, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or1:
@@ -635,11 +633,11 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: or2:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    neg a5, a0
-; RV32I-NEXT:    and a0, a5, a1
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    and a1, a5, a2
-; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a0, a3, a1
+; RV32I-NEXT:    or a1, a4, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: or2:
@@ -652,10 +650,9 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: or2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: or2:
@@ -673,10 +670,9 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: or2:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a2, a0
-; RV32ZICOND-NEXT:    or a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a3, a1
+; RV32ZICOND-NEXT:    or a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or2:
@@ -692,11 +688,11 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: or3:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a0, -1
-; RV32I-NEXT:    and a0, a5, a3
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    and a1, a5, a4
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    and a4, a0, a4
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    or a1, a2, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: or3:
@@ -709,10 +705,9 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: or3:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: or3:
@@ -730,10 +725,9 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: or3:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
-; RV32ZICOND-NEXT:    or a3, a1, a3
-; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
-; RV32ZICOND-NEXT:    or a1, a2, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or3:
@@ -749,11 +743,11 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: or4:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a0, -1
-; RV32I-NEXT:    and a0, a5, a1
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    and a1, a5, a2
-; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a0, a3, a1
+; RV32I-NEXT:    or a1, a4, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: or4:
@@ -766,10 +760,9 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: or4:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: or4:
@@ -787,10 +780,9 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: or4:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a3, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a2, a0
-; RV32ZICOND-NEXT:    or a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a3, a1
+; RV32ZICOND-NEXT:    or a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or4:
@@ -806,11 +798,11 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: xor1:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    neg a5, a0
-; RV32I-NEXT:    and a0, a5, a3
-; RV32I-NEXT:    xor a0, a1, a0
-; RV32I-NEXT:    and a1, a5, a4
-; RV32I-NEXT:    xor a1, a2, a1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    and a4, a0, a4
+; RV32I-NEXT:    xor a0, a1, a3
+; RV32I-NEXT:    xor a1, a2, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: xor1:
@@ -823,10 +815,9 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: xor1:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: xor1:
@@ -844,10 +835,9 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: xor1:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
-; RV32ZICOND-NEXT:    xor a3, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
-; RV32ZICOND-NEXT:    xor a1, a2, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a4, a4, a0
+; RV32ZICOND-NEXT:    xor a0, a1, a3
+; RV32ZICOND-NEXT:    xor a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor1:
@@ -863,11 +853,11 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: xor2:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    neg a5, a0
-; RV32I-NEXT:    and a0, a5, a1
-; RV32I-NEXT:    xor a0, a3, a0
-; RV32I-NEXT:    and a1, a5, a2
-; RV32I-NEXT:    xor a1, a4, a1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    xor a0, a3, a1
+; RV32I-NEXT:    xor a1, a4, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: xor2:
@@ -880,10 +870,9 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: xor2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: xor2:
@@ -901,10 +890,9 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: xor2:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a0
-; RV32ZICOND-NEXT:    xor a3, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a2, a0
-; RV32ZICOND-NEXT:    xor a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    xor a0, a3, a1
+; RV32ZICOND-NEXT:    xor a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor2:
@@ -920,11 +908,11 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: xor3:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a0, -1
-; RV32I-NEXT:    and a0, a5, a3
-; RV32I-NEXT:    xor a0, a1, a0
-; RV32I-NEXT:    and a1, a5, a4
-; RV32I-NEXT:    xor a1, a2, a1
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    and a4, a0, a4
+; RV32I-NEXT:    xor a0, a1, a3
+; RV32I-NEXT:    xor a1, a2, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: xor3:
@@ -937,10 +925,9 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: xor3:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: xor3:
@@ -958,10 +945,9 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: xor3:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
-; RV32ZICOND-NEXT:    xor a3, a1, a3
-; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
-; RV32ZICOND-NEXT:    xor a1, a2, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
+; RV32ZICOND-NEXT:    xor a0, a1, a3
+; RV32ZICOND-NEXT:    xor a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor3:
@@ -977,11 +963,11 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: xor4:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a0, -1
-; RV32I-NEXT:    and a0, a5, a1
-; RV32I-NEXT:    xor a0, a3, a0
-; RV32I-NEXT:    and a1, a5, a2
-; RV32I-NEXT:    xor a1, a4, a1
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    xor a0, a3, a1
+; RV32I-NEXT:    xor a1, a4, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: xor4:
@@ -994,10 +980,9 @@ define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: xor4:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: xor4:
@@ -1015,10 +1000,9 @@ define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: xor4:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a0
-; RV32ZICOND-NEXT:    xor a3, a3, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a2, a0
-; RV32ZICOND-NEXT:    xor a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    xor a0, a3, a1
+; RV32ZICOND-NEXT:    xor a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor4:
@@ -1057,10 +1041,9 @@ define i64 @and1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-NEXT:    and a4, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    and a3, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: and1:
@@ -1082,10 +1065,9 @@ define i64 @and1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-NEXT:    and a4, a2, a4
 ; RV32ZICOND-NEXT:    and a3, a1, a3
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a3, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a2, a0
-; RV32ZICOND-NEXT:    or a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a3, a1
+; RV32ZICOND-NEXT:    or a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: and1:
@@ -1122,13 +1104,12 @@ define i64 @and2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ;
 ; RV32XVENTANACONDOPS-LABEL: and2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
-; RV32XVENTANACONDOPS-NEXT:    and a5, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    and a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    and a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    or a2, a1, a2
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: and2:
@@ -1147,13 +1128,12 @@ define i64 @and2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ;
 ; RV32ZICOND-LABEL: and2:
 ; RV32ZICOND:       # %bb.0:
-; RV32ZICOND-NEXT:    and a5, a2, a4
+; RV32ZICOND-NEXT:    and a2, a2, a4
 ; RV32ZICOND-NEXT:    and a1, a1, a3
-; RV32ZICOND-NEXT:    czero.nez a2, a3, a0
-; RV32ZICOND-NEXT:    or a2, a1, a2
-; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
-; RV32ZICOND-NEXT:    or a1, a5, a1
-; RV32ZICOND-NEXT:    mv a0, a2
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: and2:
@@ -1193,10 +1173,9 @@ define i64 @and3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-NEXT:    and a4, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    and a3, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: and3:
@@ -1218,10 +1197,9 @@ define i64 @and3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-NEXT:    and a4, a2, a4
 ; RV32ZICOND-NEXT:    and a3, a1, a3
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a2, a0
-; RV32ZICOND-NEXT:    or a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a3, a1
+; RV32ZICOND-NEXT:    or a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: and3:
@@ -1258,13 +1236,12 @@ define i64 @and4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ;
 ; RV32XVENTANACONDOPS-LABEL: and4:
 ; RV32XVENTANACONDOPS:       # %bb.0:
-; RV32XVENTANACONDOPS-NEXT:    and a5, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    and a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    and a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    or a2, a1, a2
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: and4:
@@ -1283,13 +1260,12 @@ define i64 @and4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ;
 ; RV32ZICOND-LABEL: and4:
 ; RV32ZICOND:       # %bb.0:
-; RV32ZICOND-NEXT:    and a5, a2, a4
+; RV32ZICOND-NEXT:    and a2, a2, a4
 ; RV32ZICOND-NEXT:    and a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a0
-; RV32ZICOND-NEXT:    or a2, a1, a2
-; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
-; RV32ZICOND-NEXT:    or a1, a5, a1
-; RV32ZICOND-NEXT:    mv a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a4, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: and4:
@@ -1328,11 +1304,10 @@ define i64 @basic(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: basic:
@@ -1352,11 +1327,10 @@ define i64 @basic(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a1, a3
-; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
-; RV32ZICOND-NEXT:    czero.eqz a0, a2, a0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: basic:
@@ -1397,13 +1371,13 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq:
@@ -1425,13 +1399,13 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor a1, a1, a3
 ; RV32ZICOND-NEXT:    xor a0, a0, a2
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq:
@@ -1474,13 +1448,13 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setne:
@@ -1502,13 +1476,13 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor a1, a1, a3
 ; RV32ZICOND-NEXT:    xor a0, a0, a2
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setne:
@@ -1555,16 +1529,16 @@ define i64 @setgt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    slt a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setgt:
@@ -1586,16 +1560,16 @@ define i64 @setgt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    slt a1, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setgt:
@@ -1642,16 +1616,16 @@ define i64 @setge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    slt a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setge:
@@ -1673,16 +1647,16 @@ define i64 @setge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    slt a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setge:
@@ -1729,16 +1703,16 @@ define i64 @setlt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    slt a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setlt:
@@ -1760,16 +1734,16 @@ define i64 @setlt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    slt a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setlt:
@@ -1816,16 +1790,16 @@ define i64 @setle(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    slt a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setle:
@@ -1847,16 +1821,16 @@ define i64 @setle(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    slt a1, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setle:
@@ -1903,16 +1877,16 @@ define i64 @setugt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setugt:
@@ -1934,16 +1908,16 @@ define i64 @setugt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    sltu a1, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setugt:
@@ -1990,16 +1964,16 @@ define i64 @setuge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setuge:
@@ -2021,16 +1995,16 @@ define i64 @setuge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    sltu a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setuge:
@@ -2077,16 +2051,16 @@ define i64 @setult(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setult:
@@ -2108,16 +2082,16 @@ define i64 @setult(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    sltu a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setult:
@@ -2164,16 +2138,16 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setule:
@@ -2195,16 +2169,16 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    sltu a1, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setule:
@@ -2243,13 +2217,13 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ;
 ; RV32XVENTANACONDOPS-LABEL: seteq_zero:
 ; RV32XVENTANACONDOPS:       # %bb.0:
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq_zero:
@@ -2267,13 +2241,13 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ;
 ; RV32ZICOND-LABEL: seteq_zero:
 ; RV32ZICOND:       # %bb.0:
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a4, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a5, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq_zero:
@@ -2311,13 +2285,13 @@ define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ;
 ; RV32XVENTANACONDOPS-LABEL: setne_zero:
 ; RV32XVENTANACONDOPS:       # %bb.0:
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setne_zero:
@@ -2335,13 +2309,13 @@ define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ;
 ; RV32ZICOND-LABEL: setne_zero:
 ; RV32ZICOND:       # %bb.0:
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a4, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a5, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setne_zero:
@@ -2382,13 +2356,13 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: seteq_constant:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, 123
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq_constant:
@@ -2409,13 +2383,13 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: seteq_constant:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xori a0, a0, 123
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a4, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a5, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq_constant:
@@ -2457,13 +2431,13 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: setne_constant:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, 456
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setne_constant:
@@ -2484,13 +2458,13 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: setne_constant:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xori a0, a0, 456
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a4, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a5, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setne_constant:
@@ -2532,13 +2506,13 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: seteq_2048:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    binvi a0, a0, 11
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq_2048:
@@ -2559,13 +2533,13 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: seteq_2048:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    binvi a0, a0, 11
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a4, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a5, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq_2048:
@@ -2609,13 +2583,13 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    not a1, a1
 ; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq_neg2048:
@@ -2637,13 +2611,13 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    not a1, a1
 ; RV32ZICOND-NEXT:    xori a0, a0, -2048
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a4, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a5, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq_neg2048:
@@ -2687,13 +2661,13 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    not a1, a1
 ; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setne_neg2048:
@@ -2715,13 +2689,13 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    not a1, a1
 ; RV32ZICOND-NEXT:    xori a0, a0, -2048
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a4, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a5, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setne_neg2048:
@@ -4097,10 +4071,10 @@ define i64 @setune_64(float %a, float %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-NEXT:    feq.s a4, fa0, fa1
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, a4
-; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a2
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a3, a4
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a4
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setune_64:
@@ -4122,10 +4096,10 @@ define i64 @setune_64(float %a, float %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-NEXT:    feq.s a4, fa0, fa1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a4
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setune_64:
diff --git a/llvm/test/CodeGen/RISCV/copysign-casts.ll b/llvm/test/CodeGen/RISCV/copysign-casts.ll
index 3b376626a783d2..53de36f1699a93 100644
--- a/llvm/test/CodeGen/RISCV/copysign-casts.ll
+++ b/llvm/test/CodeGen/RISCV/copysign-casts.ll
@@ -45,8 +45,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV32I-LABEL: fold_promote_d_s:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a3, 524288
-; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -54,9 +54,9 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV64I-LABEL: fold_promote_d_s:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 524288
+; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -65,8 +65,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    fmv.x.w a2, fa0
 ; RV32IF-NEXT:    lui a3, 524288
-; RV32IF-NEXT:    and a2, a2, a3
 ; RV32IF-NEXT:    slli a1, a1, 1
+; RV32IF-NEXT:    and a2, a2, a3
 ; RV32IF-NEXT:    srli a1, a1, 1
 ; RV32IF-NEXT:    or a1, a1, a2
 ; RV32IF-NEXT:    ret
@@ -87,8 +87,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV32IFZFH:       # %bb.0:
 ; RV32IFZFH-NEXT:    fmv.x.w a2, fa0
 ; RV32IFZFH-NEXT:    lui a3, 524288
-; RV32IFZFH-NEXT:    and a2, a2, a3
 ; RV32IFZFH-NEXT:    slli a1, a1, 1
+; RV32IFZFH-NEXT:    and a2, a2, a3
 ; RV32IFZFH-NEXT:    srli a1, a1, 1
 ; RV32IFZFH-NEXT:    or a1, a1, a2
 ; RV32IFZFH-NEXT:    ret
@@ -109,8 +109,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV32IFZFHMIN:       # %bb.0:
 ; RV32IFZFHMIN-NEXT:    fmv.x.w a2, fa0
 ; RV32IFZFHMIN-NEXT:    lui a3, 524288
-; RV32IFZFHMIN-NEXT:    and a2, a2, a3
 ; RV32IFZFHMIN-NEXT:    slli a1, a1, 1
+; RV32IFZFHMIN-NEXT:    and a2, a2, a3
 ; RV32IFZFHMIN-NEXT:    srli a1, a1, 1
 ; RV32IFZFHMIN-NEXT:    or a1, a1, a2
 ; RV32IFZFHMIN-NEXT:    ret
@@ -147,9 +147,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV32I-LABEL: fold_promote_d_h:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a3, 8
+; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    slli a2, a2, 16
-; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -157,9 +157,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV64I-LABEL: fold_promote_d_h:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 8
+; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -168,9 +168,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    fmv.x.w a2, fa0
 ; RV32IF-NEXT:    lui a3, 8
+; RV32IF-NEXT:    slli a1, a1, 1
 ; RV32IF-NEXT:    and a2, a2, a3
 ; RV32IF-NEXT:    slli a2, a2, 16
-; RV32IF-NEXT:    slli a1, a1, 1
 ; RV32IF-NEXT:    srli a1, a1, 1
 ; RV32IF-NEXT:    or a1, a1, a2
 ; RV32IF-NEXT:    ret
@@ -209,9 +209,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV32IFZFH:       # %bb.0:
 ; RV32IFZFH-NEXT:    fmv.x.h a2, fa0
 ; RV32IFZFH-NEXT:    lui a3, 8
+; RV32IFZFH-NEXT:    slli a1, a1, 1
 ; RV32IFZFH-NEXT:    and a2, a2, a3
 ; RV32IFZFH-NEXT:    slli a2, a2, 16
-; RV32IFZFH-NEXT:    slli a1, a1, 1
 ; RV32IFZFH-NEXT:    srli a1, a1, 1
 ; RV32IFZFH-NEXT:    or a1, a1, a2
 ; RV32IFZFH-NEXT:    ret
@@ -232,9 +232,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV32IFZFHMIN:       # %bb.0:
 ; RV32IFZFHMIN-NEXT:    fmv.x.h a2, fa0
 ; RV32IFZFHMIN-NEXT:    lui a3, 8
+; RV32IFZFHMIN-NEXT:    slli a1, a1, 1
 ; RV32IFZFHMIN-NEXT:    and a2, a2, a3
 ; RV32IFZFHMIN-NEXT:    slli a2, a2, 16
-; RV32IFZFHMIN-NEXT:    slli a1, a1, 1
 ; RV32IFZFHMIN-NEXT:    srli a1, a1, 1
 ; RV32IFZFHMIN-NEXT:    or a1, a1, a2
 ; RV32IFZFHMIN-NEXT:    ret
@@ -292,9 +292,9 @@ define float @fold_promote_f_h(float %a, half %b) nounwind {
 ; RV32I-LABEL: fold_promote_f_h:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 8
+; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -302,9 +302,9 @@ define float @fold_promote_f_h(float %a, half %b) nounwind {
 ; RV64I-LABEL: fold_promote_f_h:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 8
+; RV64I-NEXT:    slli a0, a0, 33
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slliw a1, a1, 16
-; RV64I-NEXT:    slli a0, a0, 33
 ; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -423,8 +423,8 @@ define float @fold_demote_s_d(float %a, double %b) nounwind {
 ; RV32I-LABEL: fold_demote_s_d:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a1, 524288
-; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -432,8 +432,8 @@ define float @fold_demote_s_d(float %a, double %b) nounwind {
 ; RV64I-LABEL: fold_demote_s_d:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    srli a1, a1, 63
+; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a1
@@ -515,9 +515,9 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32I-LABEL: fold_demote_h_s:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
+; RV32I-NEXT:    slli a0, a0, 17
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a1, a1, 16
-; RV32I-NEXT:    slli a0, a0, 17
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -525,8 +525,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV64I-LABEL: fold_demote_h_s:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a1, a1, 31
-; RV64I-NEXT:    slli a1, a1, 15
 ; RV64I-NEXT:    slli a0, a0, 49
+; RV64I-NEXT:    slli a1, a1, 15
 ; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -537,8 +537,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IF-NEXT:    fmv.x.w a1, fa1
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    and a1, a1, a2
-; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    slli a0, a0, 17
+; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    srli a0, a0, 17
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    lui a1, 1048560
@@ -552,8 +552,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IFD-NEXT:    fmv.x.w a1, fa1
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    and a1, a1, a2
-; RV32IFD-NEXT:    srli a1, a1, 16
 ; RV32IFD-NEXT:    slli a0, a0, 17
+; RV32IFD-NEXT:    srli a1, a1, 16
 ; RV32IFD-NEXT:    srli a0, a0, 17
 ; RV32IFD-NEXT:    or a0, a0, a1
 ; RV32IFD-NEXT:    lui a1, 1048560
@@ -567,8 +567,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV64IFD-NEXT:    fmv.x.w a1, fa1
 ; RV64IFD-NEXT:    lui a2, 524288
 ; RV64IFD-NEXT:    and a1, a1, a2
-; RV64IFD-NEXT:    srli a1, a1, 16
 ; RV64IFD-NEXT:    slli a0, a0, 49
+; RV64IFD-NEXT:    srli a1, a1, 16
 ; RV64IFD-NEXT:    srli a0, a0, 49
 ; RV64IFD-NEXT:    or a0, a0, a1
 ; RV64IFD-NEXT:    lui a1, 1048560
@@ -597,10 +597,10 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IFZFHMIN-LABEL: fold_demote_h_s:
 ; RV32IFZFHMIN:       # %bb.0:
 ; RV32IFZFHMIN-NEXT:    fmv.x.w a0, fa1
-; RV32IFZFHMIN-NEXT:    srli a0, a0, 31
-; RV32IFZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFZFHMIN-NEXT:    fmv.x.h a1, fa0
+; RV32IFZFHMIN-NEXT:    srli a0, a0, 31
 ; RV32IFZFHMIN-NEXT:    slli a1, a1, 17
+; RV32IFZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFZFHMIN-NEXT:    srli a1, a1, 17
 ; RV32IFZFHMIN-NEXT:    or a0, a1, a0
 ; RV32IFZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -609,10 +609,10 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IFDZFHMIN-LABEL: fold_demote_h_s:
 ; RV32IFDZFHMIN:       # %bb.0:
 ; RV32IFDZFHMIN-NEXT:    fmv.x.w a0, fa1
-; RV32IFDZFHMIN-NEXT:    srli a0, a0, 31
-; RV32IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFDZFHMIN-NEXT:    fmv.x.h a1, fa0
+; RV32IFDZFHMIN-NEXT:    srli a0, a0, 31
 ; RV32IFDZFHMIN-NEXT:    slli a1, a1, 17
+; RV32IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFDZFHMIN-NEXT:    srli a1, a1, 17
 ; RV32IFDZFHMIN-NEXT:    or a0, a1, a0
 ; RV32IFDZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -621,10 +621,10 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV64IFDZFHMIN-LABEL: fold_demote_h_s:
 ; RV64IFDZFHMIN:       # %bb.0:
 ; RV64IFDZFHMIN-NEXT:    fmv.x.w a0, fa1
-; RV64IFDZFHMIN-NEXT:    srli a0, a0, 31
-; RV64IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV64IFDZFHMIN-NEXT:    fmv.x.h a1, fa0
+; RV64IFDZFHMIN-NEXT:    srli a0, a0, 31
 ; RV64IFDZFHMIN-NEXT:    slli a1, a1, 49
+; RV64IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV64IFDZFHMIN-NEXT:    srli a1, a1, 49
 ; RV64IFDZFHMIN-NEXT:    or a0, a1, a0
 ; RV64IFDZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -635,11 +635,11 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IZDINX-NEXT:    # kill: def $x11_w killed $x11_w def $x11
 ; RV32IZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV32IZDINX-NEXT:    lui a2, 524288
-; RV32IZDINX-NEXT:    and a1, a1, a2
-; RV32IZDINX-NEXT:    srli a1, a1, 16
 ; RV32IZDINX-NEXT:    slli a0, a0, 17
-; RV32IZDINX-NEXT:    srli a0, a0, 17
+; RV32IZDINX-NEXT:    and a1, a1, a2
 ; RV32IZDINX-NEXT:    lui a2, 1048560
+; RV32IZDINX-NEXT:    srli a0, a0, 17
+; RV32IZDINX-NEXT:    srli a1, a1, 16
 ; RV32IZDINX-NEXT:    or a0, a0, a2
 ; RV32IZDINX-NEXT:    or a0, a0, a1
 ; RV32IZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10
@@ -650,11 +650,11 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV64IZDINX-NEXT:    # kill: def $x11_w killed $x11_w def $x11
 ; RV64IZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV64IZDINX-NEXT:    lui a2, 524288
-; RV64IZDINX-NEXT:    and a1, a1, a2
-; RV64IZDINX-NEXT:    srli a1, a1, 16
 ; RV64IZDINX-NEXT:    slli a0, a0, 49
-; RV64IZDINX-NEXT:    srli a0, a0, 49
+; RV64IZDINX-NEXT:    and a1, a1, a2
 ; RV64IZDINX-NEXT:    lui a2, 1048560
+; RV64IZDINX-NEXT:    srli a0, a0, 49
+; RV64IZDINX-NEXT:    srli a1, a1, 16
 ; RV64IZDINX-NEXT:    or a0, a0, a2
 ; RV64IZDINX-NEXT:    or a0, a0, a1
 ; RV64IZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10
@@ -668,9 +668,9 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32I-LABEL: fold_demote_h_d:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    slli a0, a0, 17
 ; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    srli a1, a1, 16
-; RV32I-NEXT:    slli a0, a0, 17
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -678,8 +678,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV64I-LABEL: fold_demote_h_d:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 49
-; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    srli a1, a1, 63
+; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    or a0, a0, a1
@@ -690,8 +690,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IF-NEXT:    fmv.x.w a0, fa0
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    and a1, a1, a2
-; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    slli a0, a0, 17
+; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    srli a0, a0, 17
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    lui a1, 1048560
@@ -707,10 +707,10 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IFD-NEXT:    fmv.x.w a1, fa0
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    and a0, a0, a2
-; RV32IFD-NEXT:    srli a0, a0, 16
+; RV32IFD-NEXT:    lui a2, 1048560
 ; RV32IFD-NEXT:    slli a1, a1, 17
 ; RV32IFD-NEXT:    srli a1, a1, 17
-; RV32IFD-NEXT:    lui a2, 1048560
+; RV32IFD-NEXT:    srli a0, a0, 16
 ; RV32IFD-NEXT:    or a1, a1, a2
 ; RV32IFD-NEXT:    or a0, a1, a0
 ; RV32IFD-NEXT:    fmv.w.x fa0, a0
@@ -721,12 +721,12 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV64IFD:       # %bb.0:
 ; RV64IFD-NEXT:    fmv.x.d a0, fa1
 ; RV64IFD-NEXT:    fmv.x.w a1, fa0
+; RV64IFD-NEXT:    lui a2, 1048560
 ; RV64IFD-NEXT:    slli a1, a1, 49
-; RV64IFD-NEXT:    srli a1, a1, 49
 ; RV64IFD-NEXT:    srli a0, a0, 63
+; RV64IFD-NEXT:    srli a1, a1, 49
 ; RV64IFD-NEXT:    slli a0, a0, 63
 ; RV64IFD-NEXT:    srli a0, a0, 48
-; RV64IFD-NEXT:    lui a2, 1048560
 ; RV64IFD-NEXT:    or a1, a1, a2
 ; RV64IFD-NEXT:    or a0, a1, a0
 ; RV64IFD-NEXT:    fmv.w.x fa0, a0
@@ -754,8 +754,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IFZFHMIN-LABEL: fold_demote_h_d:
 ; RV32IFZFHMIN:       # %bb.0:
 ; RV32IFZFHMIN-NEXT:    srli a1, a1, 31
-; RV32IFZFHMIN-NEXT:    slli a1, a1, 15
 ; RV32IFZFHMIN-NEXT:    fmv.x.h a0, fa0
+; RV32IFZFHMIN-NEXT:    slli a1, a1, 15
 ; RV32IFZFHMIN-NEXT:    slli a0, a0, 17
 ; RV32IFZFHMIN-NEXT:    srli a0, a0, 17
 ; RV32IFZFHMIN-NEXT:    or a0, a0, a1
@@ -767,10 +767,10 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IFDZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32IFDZFHMIN-NEXT:    fsd fa1, 8(sp)
 ; RV32IFDZFHMIN-NEXT:    lw a0, 12(sp)
-; RV32IFDZFHMIN-NEXT:    srli a0, a0, 31
-; RV32IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFDZFHMIN-NEXT:    fmv.x.h a1, fa0
 ; RV32IFDZFHMIN-NEXT:    slli a1, a1, 17
+; RV32IFDZFHMIN-NEXT:    srli a0, a0, 31
+; RV32IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFDZFHMIN-NEXT:    srli a1, a1, 17
 ; RV32IFDZFHMIN-NEXT:    or a0, a1, a0
 ; RV32IFDZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -780,10 +780,10 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV64IFDZFHMIN-LABEL: fold_demote_h_d:
 ; RV64IFDZFHMIN:       # %bb.0:
 ; RV64IFDZFHMIN-NEXT:    fmv.x.d a0, fa1
-; RV64IFDZFHMIN-NEXT:    srli a0, a0, 63
-; RV64IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV64IFDZFHMIN-NEXT:    fmv.x.h a1, fa0
+; RV64IFDZFHMIN-NEXT:    srli a0, a0, 63
 ; RV64IFDZFHMIN-NEXT:    slli a1, a1, 49
+; RV64IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV64IFDZFHMIN-NEXT:    srli a1, a1, 49
 ; RV64IFDZFHMIN-NEXT:    or a0, a1, a0
 ; RV64IFDZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -793,11 +793,11 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IZDINX:       # %bb.0:
 ; RV32IZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV32IZDINX-NEXT:    lui a1, 524288
-; RV32IZDINX-NEXT:    and a1, a2, a1
-; RV32IZDINX-NEXT:    srli a1, a1, 16
 ; RV32IZDINX-NEXT:    slli a0, a0, 17
-; RV32IZDINX-NEXT:    srli a0, a0, 17
+; RV32IZDINX-NEXT:    and a1, a2, a1
 ; RV32IZDINX-NEXT:    lui a2, 1048560
+; RV32IZDINX-NEXT:    srli a0, a0, 17
+; RV32IZDINX-NEXT:    srli a1, a1, 16
 ; RV32IZDINX-NEXT:    or a0, a0, a2
 ; RV32IZDINX-NEXT:    or a0, a0, a1
 ; RV32IZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10
@@ -807,11 +807,11 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV64IZDINX:       # %bb.0:
 ; RV64IZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV64IZDINX-NEXT:    slli a0, a0, 49
-; RV64IZDINX-NEXT:    srli a0, a0, 49
 ; RV64IZDINX-NEXT:    srli a1, a1, 63
+; RV64IZDINX-NEXT:    lui a2, 1048560
+; RV64IZDINX-NEXT:    srli a0, a0, 49
 ; RV64IZDINX-NEXT:    slli a1, a1, 63
 ; RV64IZDINX-NEXT:    srli a1, a1, 48
-; RV64IZDINX-NEXT:    lui a2, 1048560
 ; RV64IZDINX-NEXT:    or a0, a0, a2
 ; RV64IZDINX-NEXT:    or a0, a0, a1
 ; RV64IZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 2c691a2de4c4de..da97ac0d742379 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -131,18 +131,18 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:  # %bb.1: # %cond.false
 ; RV32_NOZBB-NEXT:    addi a1, a0, -1
 ; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
 ; RV32_NOZBB-NEXT:    lui a2, 5
-; RV32_NOZBB-NEXT:    addi a2, a2, 1365
-; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    and a0, a0, a1
+; RV32_NOZBB-NEXT:    addi a1, a2, 1365
+; RV32_NOZBB-NEXT:    srli a2, a0, 1
+; RV32_NOZBB-NEXT:    and a1, a2, a1
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -161,18 +161,18 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
 ; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
 ; RV64NOZBB-NEXT:    addi a1, a0, -1
 ; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
 ; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    and a0, a0, a1
+; RV64NOZBB-NEXT:    addiw a1, a2, 1365
+; RV64NOZBB-NEXT:    srli a2, a0, 1
+; RV64NOZBB-NEXT:    and a1, a2, a1
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -620,18 +620,18 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
 ; RV32_NOZBB:       # %bb.0:
 ; RV32_NOZBB-NEXT:    addi a1, a0, -1
 ; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
 ; RV32_NOZBB-NEXT:    lui a2, 5
-; RV32_NOZBB-NEXT:    addi a2, a2, 1365
-; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    and a0, a0, a1
+; RV32_NOZBB-NEXT:    addi a1, a2, 1365
+; RV32_NOZBB-NEXT:    srli a2, a0, 1
+; RV32_NOZBB-NEXT:    and a1, a2, a1
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -644,18 +644,18 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
 ; RV64NOZBB:       # %bb.0:
 ; RV64NOZBB-NEXT:    addi a1, a0, -1
 ; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
 ; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    and a0, a0, a1
+; RV64NOZBB-NEXT:    addiw a1, a2, 1365
+; RV64NOZBB-NEXT:    srli a2, a0, 1
+; RV64NOZBB-NEXT:    and a1, a2, a1
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -1052,28 +1052,28 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    beqz a1, .LBB9_2
 ; RV32_NOZBB-NEXT:  # %bb.1: # %cond.false
 ; RV32_NOZBB-NEXT:    srli a1, a1, 17
+; RV32_NOZBB-NEXT:    lui a2, 5
 ; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 16
-; RV32_NOZBB-NEXT:    srli a1, a1, 18
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 16
-; RV32_NOZBB-NEXT:    srli a1, a1, 20
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 16
-; RV32_NOZBB-NEXT:    srli a1, a1, 24
-; RV32_NOZBB-NEXT:    or a0, a0, a1
+; RV32_NOZBB-NEXT:    addi a1, a2, 1365
+; RV32_NOZBB-NEXT:    slli a2, a0, 16
+; RV32_NOZBB-NEXT:    srli a2, a2, 18
+; RV32_NOZBB-NEXT:    or a0, a0, a2
+; RV32_NOZBB-NEXT:    slli a2, a0, 16
+; RV32_NOZBB-NEXT:    srli a2, a2, 20
+; RV32_NOZBB-NEXT:    or a0, a0, a2
+; RV32_NOZBB-NEXT:    slli a2, a0, 16
+; RV32_NOZBB-NEXT:    srli a2, a2, 24
+; RV32_NOZBB-NEXT:    or a0, a0, a2
 ; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
-; RV32_NOZBB-NEXT:    lui a2, 5
-; RV32_NOZBB-NEXT:    addi a2, a2, 1365
-; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    srli a2, a0, 1
+; RV32_NOZBB-NEXT:    and a1, a2, a1
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -1091,28 +1091,28 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
 ; RV64NOZBB-NEXT:    beqz a1, .LBB9_2
 ; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
 ; RV64NOZBB-NEXT:    srli a1, a1, 49
+; RV64NOZBB-NEXT:    lui a2, 5
 ; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 50
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 52
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 56
-; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    addiw a1, a2, 1365
+; RV64NOZBB-NEXT:    slli a2, a0, 48
+; RV64NOZBB-NEXT:    srli a2, a2, 50
+; RV64NOZBB-NEXT:    or a0, a0, a2
+; RV64NOZBB-NEXT:    slli a2, a0, 48
+; RV64NOZBB-NEXT:    srli a2, a2, 52
+; RV64NOZBB-NEXT:    or a0, a0, a2
+; RV64NOZBB-NEXT:    slli a2, a0, 48
+; RV64NOZBB-NEXT:    srli a2, a2, 56
+; RV64NOZBB-NEXT:    or a0, a0, a2
 ; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    srli a2, a0, 1
+; RV64NOZBB-NEXT:    and a1, a2, a1
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -1161,31 +1161,31 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    beqz a0, .LBB10_2
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -1203,31 +1203,31 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    beqz a1, .LBB10_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -1244,33 +1244,33 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV32M-NEXT:    beqz a0, .LBB10_2
 ; RV32M-NEXT:  # %bb.1: # %cond.false
 ; RV32M-NEXT:    srli a1, a0, 1
+; RV32M-NEXT:    lui a2, 349525
 ; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 2
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 4
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 8
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 16
-; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    addi a1, a2, 1365
+; RV32M-NEXT:    srli a2, a0, 2
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 4
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 8
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 16
+; RV32M-NEXT:    or a0, a0, a2
 ; RV32M-NEXT:    not a0, a0
-; RV32M-NEXT:    srli a1, a0, 1
-; RV32M-NEXT:    lui a2, 349525
-; RV32M-NEXT:    addi a2, a2, 1365
-; RV32M-NEXT:    and a1, a1, a2
+; RV32M-NEXT:    srli a2, a0, 1
+; RV32M-NEXT:    and a1, a2, a1
+; RV32M-NEXT:    lui a2, 209715
+; RV32M-NEXT:    addi a2, a2, 819
 ; RV32M-NEXT:    sub a0, a0, a1
-; RV32M-NEXT:    lui a1, 209715
-; RV32M-NEXT:    addi a1, a1, 819
-; RV32M-NEXT:    and a2, a0, a1
+; RV32M-NEXT:    and a1, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    add a0, a2, a0
+; RV32M-NEXT:    and a0, a0, a2
+; RV32M-NEXT:    lui a2, 61681
+; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    lui a1, 61681
-; RV32M-NEXT:    addi a1, a1, -241
-; RV32M-NEXT:    and a0, a0, a1
 ; RV32M-NEXT:    lui a1, 4112
+; RV32M-NEXT:    addi a2, a2, -241
+; RV32M-NEXT:    and a0, a0, a2
 ; RV32M-NEXT:    addi a1, a1, 257
 ; RV32M-NEXT:    mul a0, a0, a1
 ; RV32M-NEXT:    srli a0, a0, 24
@@ -1285,33 +1285,33 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV64M-NEXT:    beqz a1, .LBB10_2
 ; RV64M-NEXT:  # %bb.1: # %cond.false
 ; RV64M-NEXT:    srliw a1, a0, 1
+; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    addiw a1, a2, 1365
+; RV64M-NEXT:    srliw a2, a0, 2
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 4
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 8
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 16
+; RV64M-NEXT:    or a0, a0, a2
 ; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    srli a2, a0, 1
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    lui a2, 209715
+; RV64M-NEXT:    addiw a2, a2, 819
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 61681
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
-; RV64M-NEXT:    and a0, a0, a1
 ; RV64M-NEXT:    lui a1, 4112
+; RV64M-NEXT:    addi a2, a2, -241
+; RV64M-NEXT:    and a0, a0, a2
 ; RV64M-NEXT:    addi a1, a1, 257
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srliw a0, a0, 24
@@ -1349,11 +1349,11 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -1420,40 +1420,40 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB11_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -1469,13 +1469,13 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32M-LABEL: test_ctlz_i64:
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    lui a2, 349525
+; RV32M-NEXT:    lui a3, 209715
+; RV32M-NEXT:    lui a6, 61681
+; RV32M-NEXT:    lui a7, 4112
 ; RV32M-NEXT:    addi a5, a2, 1365
-; RV32M-NEXT:    lui a2, 209715
-; RV32M-NEXT:    addi a4, a2, 819
-; RV32M-NEXT:    lui a2, 61681
-; RV32M-NEXT:    addi a2, a2, -241
-; RV32M-NEXT:    lui a3, 4112
-; RV32M-NEXT:    addi a3, a3, 257
+; RV32M-NEXT:    addi a4, a3, 819
+; RV32M-NEXT:    addi a3, a6, -241
+; RV32M-NEXT:    addi a2, a7, 257
 ; RV32M-NEXT:    bnez a1, .LBB11_2
 ; RV32M-NEXT:  # %bb.1:
 ; RV32M-NEXT:    srli a1, a0, 1
@@ -1498,8 +1498,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    mul a0, a0, a3
+; RV32M-NEXT:    and a0, a0, a3
+; RV32M-NEXT:    mul a0, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    addi a0, a0, 32
 ; RV32M-NEXT:    li a1, 0
@@ -1525,8 +1525,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    mul a0, a0, a3
+; RV32M-NEXT:    and a0, a0, a3
+; RV32M-NEXT:    mul a0, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    li a1, 0
 ; RV32M-NEXT:    ret
@@ -1536,44 +1536,44 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV64M-NEXT:    beqz a0, .LBB11_2
 ; RV64M-NEXT:  # %bb.1: # %cond.false
 ; RV64M-NEXT:    srli a1, a0, 1
+; RV64M-NEXT:    lui a2, 349525
+; RV64M-NEXT:    lui a3, 209715
+; RV64M-NEXT:    lui a4, 61681
 ; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 32
-; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    addiw a1, a2, 1365
+; RV64M-NEXT:    addiw a2, a3, 819
+; RV64M-NEXT:    addiw a3, a4, -241
+; RV64M-NEXT:    srli a4, a0, 2
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    slli a4, a1, 32
+; RV64M-NEXT:    add a1, a1, a4
+; RV64M-NEXT:    slli a4, a2, 32
+; RV64M-NEXT:    add a2, a2, a4
+; RV64M-NEXT:    slli a4, a3, 32
+; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    srli a4, a0, 4
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 8
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 16
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 32
+; RV64M-NEXT:    or a0, a0, a4
 ; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    slli a3, a2, 32
-; RV64M-NEXT:    add a2, a2, a3
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    srli a4, a0, 1
+; RV64M-NEXT:    and a1, a4, a1
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 4112
+; RV64M-NEXT:    addiw a2, a2, 257
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addiw a1, a1, -241
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addiw a1, a1, 257
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a1, a2, 32
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    add a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
@@ -1700,7 +1700,9 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 ; RV32_NOZBB-LABEL: test_ctlz_i16_zero_undef:
 ; RV32_NOZBB:       # %bb.0:
 ; RV32_NOZBB-NEXT:    slli a1, a0, 16
+; RV32_NOZBB-NEXT:    lui a2, 5
 ; RV32_NOZBB-NEXT:    srli a1, a1, 17
+; RV32_NOZBB-NEXT:    addi a2, a2, 1365
 ; RV32_NOZBB-NEXT:    or a0, a0, a1
 ; RV32_NOZBB-NEXT:    slli a1, a0, 16
 ; RV32_NOZBB-NEXT:    srli a1, a1, 18
@@ -1713,16 +1715,14 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    or a0, a0, a1
 ; RV32_NOZBB-NEXT:    not a0, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 1
-; RV32_NOZBB-NEXT:    lui a2, 5
-; RV32_NOZBB-NEXT:    addi a2, a2, 1365
 ; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -1734,7 +1734,9 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 ; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef:
 ; RV64NOZBB:       # %bb.0:
 ; RV64NOZBB-NEXT:    slli a1, a0, 48
+; RV64NOZBB-NEXT:    lui a2, 5
 ; RV64NOZBB-NEXT:    srli a1, a1, 49
+; RV64NOZBB-NEXT:    addiw a2, a2, 1365
 ; RV64NOZBB-NEXT:    or a0, a0, a1
 ; RV64NOZBB-NEXT:    slli a1, a0, 48
 ; RV64NOZBB-NEXT:    srli a1, a1, 50
@@ -1747,16 +1749,14 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 ; RV64NOZBB-NEXT:    or a0, a0, a1
 ; RV64NOZBB-NEXT:    not a0, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
 ; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -1796,31 +1796,31 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i32_zero_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -1832,31 +1832,31 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV64I-LABEL: test_ctlz_i32_zero_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -1868,33 +1868,33 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV32M-LABEL: test_ctlz_i32_zero_undef:
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    srli a1, a0, 1
+; RV32M-NEXT:    lui a2, 349525
 ; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 2
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 4
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 8
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 16
-; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    addi a1, a2, 1365
+; RV32M-NEXT:    srli a2, a0, 2
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 4
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 8
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 16
+; RV32M-NEXT:    or a0, a0, a2
 ; RV32M-NEXT:    not a0, a0
-; RV32M-NEXT:    srli a1, a0, 1
-; RV32M-NEXT:    lui a2, 349525
-; RV32M-NEXT:    addi a2, a2, 1365
-; RV32M-NEXT:    and a1, a1, a2
+; RV32M-NEXT:    srli a2, a0, 1
+; RV32M-NEXT:    and a1, a2, a1
+; RV32M-NEXT:    lui a2, 209715
+; RV32M-NEXT:    addi a2, a2, 819
 ; RV32M-NEXT:    sub a0, a0, a1
-; RV32M-NEXT:    lui a1, 209715
-; RV32M-NEXT:    addi a1, a1, 819
-; RV32M-NEXT:    and a2, a0, a1
+; RV32M-NEXT:    and a1, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    add a0, a2, a0
+; RV32M-NEXT:    and a0, a0, a2
+; RV32M-NEXT:    lui a2, 61681
+; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    lui a1, 61681
-; RV32M-NEXT:    addi a1, a1, -241
-; RV32M-NEXT:    and a0, a0, a1
 ; RV32M-NEXT:    lui a1, 4112
+; RV32M-NEXT:    addi a2, a2, -241
+; RV32M-NEXT:    and a0, a0, a2
 ; RV32M-NEXT:    addi a1, a1, 257
 ; RV32M-NEXT:    mul a0, a0, a1
 ; RV32M-NEXT:    srli a0, a0, 24
@@ -1903,33 +1903,33 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV64M-LABEL: test_ctlz_i32_zero_undef:
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    srliw a1, a0, 1
+; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    addiw a1, a2, 1365
+; RV64M-NEXT:    srliw a2, a0, 2
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 4
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 8
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 16
+; RV64M-NEXT:    or a0, a0, a2
 ; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    srli a2, a0, 1
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    lui a2, 209715
+; RV64M-NEXT:    addiw a2, a2, 819
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 61681
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
-; RV64M-NEXT:    and a0, a0, a1
 ; RV64M-NEXT:    lui a1, 4112
+; RV64M-NEXT:    addi a2, a2, -241
+; RV64M-NEXT:    and a0, a0, a2
 ; RV64M-NEXT:    addi a1, a1, 257
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srliw a0, a0, 24
@@ -1964,11 +1964,11 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i64_zero_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB15_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -2033,40 +2033,40 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV64I-LABEL: test_ctlz_i64_zero_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -2079,13 +2079,13 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32M-LABEL: test_ctlz_i64_zero_undef:
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    lui a2, 349525
+; RV32M-NEXT:    lui a3, 209715
+; RV32M-NEXT:    lui a6, 61681
+; RV32M-NEXT:    lui a7, 4112
 ; RV32M-NEXT:    addi a5, a2, 1365
-; RV32M-NEXT:    lui a2, 209715
-; RV32M-NEXT:    addi a4, a2, 819
-; RV32M-NEXT:    lui a2, 61681
-; RV32M-NEXT:    addi a2, a2, -241
-; RV32M-NEXT:    lui a3, 4112
-; RV32M-NEXT:    addi a3, a3, 257
+; RV32M-NEXT:    addi a4, a3, 819
+; RV32M-NEXT:    addi a3, a6, -241
+; RV32M-NEXT:    addi a2, a7, 257
 ; RV32M-NEXT:    bnez a1, .LBB15_2
 ; RV32M-NEXT:  # %bb.1:
 ; RV32M-NEXT:    srli a1, a0, 1
@@ -2108,8 +2108,8 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    mul a0, a0, a3
+; RV32M-NEXT:    and a0, a0, a3
+; RV32M-NEXT:    mul a0, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    addi a0, a0, 32
 ; RV32M-NEXT:    li a1, 0
@@ -2135,8 +2135,8 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    mul a0, a0, a3
+; RV32M-NEXT:    and a0, a0, a3
+; RV32M-NEXT:    mul a0, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    li a1, 0
 ; RV32M-NEXT:    ret
@@ -2144,44 +2144,44 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV64M-LABEL: test_ctlz_i64_zero_undef:
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    srli a1, a0, 1
+; RV64M-NEXT:    lui a2, 349525
+; RV64M-NEXT:    lui a3, 209715
+; RV64M-NEXT:    lui a4, 61681
 ; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 32
-; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    addiw a1, a2, 1365
+; RV64M-NEXT:    addiw a2, a3, 819
+; RV64M-NEXT:    addiw a3, a4, -241
+; RV64M-NEXT:    srli a4, a0, 2
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    slli a4, a1, 32
+; RV64M-NEXT:    add a1, a1, a4
+; RV64M-NEXT:    slli a4, a2, 32
+; RV64M-NEXT:    add a2, a2, a4
+; RV64M-NEXT:    slli a4, a3, 32
+; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    srli a4, a0, 4
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 8
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 16
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 32
+; RV64M-NEXT:    or a0, a0, a4
 ; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    slli a3, a2, 32
-; RV64M-NEXT:    add a2, a2, a3
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    srli a4, a0, 1
+; RV64M-NEXT:    and a1, a4, a1
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 4112
+; RV64M-NEXT:    addiw a2, a2, 257
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addiw a1, a1, -241
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addiw a1, a1, 257
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a1, a2, 32
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    add a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
@@ -2304,13 +2304,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    lui a2, 5
 ; RV32_NOZBB-NEXT:    addi a2, a2, 1365
 ; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -2325,13 +2325,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ; RV64NOZBB-NEXT:    lui a2, 5
 ; RV64NOZBB-NEXT:    addiw a2, a2, 1365
 ; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -2358,13 +2358,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ; RV32XTHEADBB-NEXT:    lui a2, 5
 ; RV32XTHEADBB-NEXT:    addi a2, a2, 1365
 ; RV32XTHEADBB-NEXT:    and a1, a1, a2
+; RV32XTHEADBB-NEXT:    lui a2, 3
+; RV32XTHEADBB-NEXT:    addi a2, a2, 819
 ; RV32XTHEADBB-NEXT:    sub a0, a0, a1
-; RV32XTHEADBB-NEXT:    lui a1, 3
-; RV32XTHEADBB-NEXT:    addi a1, a1, 819
-; RV32XTHEADBB-NEXT:    and a2, a0, a1
+; RV32XTHEADBB-NEXT:    and a1, a0, a2
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 2
-; RV32XTHEADBB-NEXT:    and a0, a0, a1
-; RV32XTHEADBB-NEXT:    add a0, a2, a0
+; RV32XTHEADBB-NEXT:    and a0, a0, a2
+; RV32XTHEADBB-NEXT:    add a0, a1, a0
 ; RV32XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
 ; RV32XTHEADBB-NEXT:    th.extu a1, a0, 11, 8
@@ -2378,13 +2378,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ; RV64XTHEADBB-NEXT:    lui a2, 5
 ; RV64XTHEADBB-NEXT:    addiw a2, a2, 1365
 ; RV64XTHEADBB-NEXT:    and a1, a1, a2
+; RV64XTHEADBB-NEXT:    lui a2, 3
+; RV64XTHEADBB-NEXT:    addiw a2, a2, 819
 ; RV64XTHEADBB-NEXT:    sub a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 3
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 819
-; RV64XTHEADBB-NEXT:    and a2, a0, a1
+; RV64XTHEADBB-NEXT:    and a1, a0, a2
 ; RV64XTHEADBB-NEXT:    srli a0, a0, 2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    add a0, a2, a0
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    add a0, a1, a0
 ; RV64XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    th.extu a1, a0, 11, 8
@@ -2402,17 +2402,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -2427,17 +2427,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -2452,19 +2452,19 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV32M-NEXT:    lui a2, 349525
 ; RV32M-NEXT:    addi a2, a2, 1365
 ; RV32M-NEXT:    and a1, a1, a2
+; RV32M-NEXT:    lui a2, 209715
+; RV32M-NEXT:    addi a2, a2, 819
 ; RV32M-NEXT:    sub a0, a0, a1
-; RV32M-NEXT:    lui a1, 209715
-; RV32M-NEXT:    addi a1, a1, 819
-; RV32M-NEXT:    and a2, a0, a1
+; RV32M-NEXT:    and a1, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    add a0, a2, a0
+; RV32M-NEXT:    and a0, a0, a2
+; RV32M-NEXT:    lui a2, 61681
+; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    lui a1, 61681
-; RV32M-NEXT:    addi a1, a1, -241
-; RV32M-NEXT:    and a0, a0, a1
 ; RV32M-NEXT:    lui a1, 4112
+; RV32M-NEXT:    addi a2, a2, -241
+; RV32M-NEXT:    and a0, a0, a2
 ; RV32M-NEXT:    addi a1, a1, 257
 ; RV32M-NEXT:    mul a0, a0, a1
 ; RV32M-NEXT:    srli a0, a0, 24
@@ -2476,19 +2476,19 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    addiw a2, a2, 1365
 ; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    lui a2, 209715
+; RV64M-NEXT:    addiw a2, a2, 819
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 61681
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
-; RV64M-NEXT:    and a0, a0, a1
 ; RV64M-NEXT:    lui a1, 4112
+; RV64M-NEXT:    addi a2, a2, -241
+; RV64M-NEXT:    and a0, a0, a2
 ; RV64M-NEXT:    addi a1, a1, 257
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srliw a0, a0, 24
@@ -2510,17 +2510,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV32XTHEADBB-NEXT:    lui a2, 349525
 ; RV32XTHEADBB-NEXT:    addi a2, a2, 1365
 ; RV32XTHEADBB-NEXT:    and a1, a1, a2
+; RV32XTHEADBB-NEXT:    lui a2, 209715
+; RV32XTHEADBB-NEXT:    addi a2, a2, 819
 ; RV32XTHEADBB-NEXT:    sub a0, a0, a1
-; RV32XTHEADBB-NEXT:    lui a1, 209715
-; RV32XTHEADBB-NEXT:    addi a1, a1, 819
-; RV32XTHEADBB-NEXT:    and a2, a0, a1
+; RV32XTHEADBB-NEXT:    and a1, a0, a2
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 2
-; RV32XTHEADBB-NEXT:    and a0, a0, a1
-; RV32XTHEADBB-NEXT:    add a0, a2, a0
+; RV32XTHEADBB-NEXT:    and a0, a0, a2
+; RV32XTHEADBB-NEXT:    lui a2, 61681
+; RV32XTHEADBB-NEXT:    add a0, a1, a0
 ; RV32XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
-; RV32XTHEADBB-NEXT:    lui a1, 61681
-; RV32XTHEADBB-NEXT:    addi a1, a1, -241
+; RV32XTHEADBB-NEXT:    addi a1, a2, -241
 ; RV32XTHEADBB-NEXT:    and a0, a0, a1
 ; RV32XTHEADBB-NEXT:    slli a1, a0, 8
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
@@ -2535,17 +2535,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64XTHEADBB-NEXT:    lui a2, 349525
 ; RV64XTHEADBB-NEXT:    addiw a2, a2, 1365
 ; RV64XTHEADBB-NEXT:    and a1, a1, a2
+; RV64XTHEADBB-NEXT:    lui a2, 209715
+; RV64XTHEADBB-NEXT:    addiw a2, a2, 819
 ; RV64XTHEADBB-NEXT:    sub a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 209715
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 819
-; RV64XTHEADBB-NEXT:    and a2, a0, a1
+; RV64XTHEADBB-NEXT:    and a1, a0, a2
 ; RV64XTHEADBB-NEXT:    srli a0, a0, 2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    add a0, a2, a0
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    lui a2, 61681
+; RV64XTHEADBB-NEXT:    add a0, a1, a0
 ; RV64XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 61681
-; RV64XTHEADBB-NEXT:    addi a1, a1, -241
+; RV64XTHEADBB-NEXT:    addi a1, a2, -241
 ; RV64XTHEADBB-NEXT:    and a0, a0, a1
 ; RV64XTHEADBB-NEXT:    slli a1, a0, 8
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
@@ -2562,39 +2562,39 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a0, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a4, a1, a2
-; RV32I-NEXT:    srli a1, a1, 2
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    srli a4, a1, 4
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi a4, a4, -241
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a5, a1, 8
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    slli a5, a1, 16
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    srli a5, a0, 1
 ; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:    sub a0, a0, a3
-; RV32I-NEXT:    and a3, a0, a2
+; RV32I-NEXT:    and a2, a1, a4
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and a3, a0, a4
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    slli a2, a0, 8
-; RV32I-NEXT:    add a0, a0, a2
-; RV32I-NEXT:    slli a2, a0, 16
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    srli a3, a0, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    slli a2, a1, 8
+; RV32I-NEXT:    slli a3, a0, 8
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    slli a2, a1, 16
+; RV32I-NEXT:    slli a3, a0, 16
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    li a1, 0
@@ -2602,28 +2602,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ;
 ; RV64I-LABEL: test_ctpop_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -2637,35 +2637,35 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    srli a2, a1, 1
 ; RV32M-NEXT:    lui a3, 349525
+; RV32M-NEXT:    lui a4, 209715
+; RV32M-NEXT:    lui a5, 61681
+; RV32M-NEXT:    srli a6, a0, 1
 ; RV32M-NEXT:    addi a3, a3, 1365
 ; RV32M-NEXT:    and a2, a2, a3
-; RV32M-NEXT:    sub a1, a1, a2
-; RV32M-NEXT:    lui a2, 209715
-; RV32M-NEXT:    addi a2, a2, 819
-; RV32M-NEXT:    and a4, a1, a2
-; RV32M-NEXT:    srli a1, a1, 2
-; RV32M-NEXT:    and a1, a1, a2
-; RV32M-NEXT:    add a1, a4, a1
-; RV32M-NEXT:    srli a4, a1, 4
-; RV32M-NEXT:    add a1, a1, a4
-; RV32M-NEXT:    lui a4, 61681
-; RV32M-NEXT:    addi a4, a4, -241
-; RV32M-NEXT:    and a1, a1, a4
-; RV32M-NEXT:    lui a5, 4112
-; RV32M-NEXT:    addi a5, a5, 257
-; RV32M-NEXT:    mul a1, a1, a5
-; RV32M-NEXT:    srli a1, a1, 24
-; RV32M-NEXT:    srli a6, a0, 1
 ; RV32M-NEXT:    and a3, a6, a3
+; RV32M-NEXT:    lui a6, 4112
+; RV32M-NEXT:    addi a4, a4, 819
+; RV32M-NEXT:    addi a5, a5, -241
+; RV32M-NEXT:    addi a6, a6, 257
+; RV32M-NEXT:    sub a1, a1, a2
 ; RV32M-NEXT:    sub a0, a0, a3
-; RV32M-NEXT:    and a3, a0, a2
+; RV32M-NEXT:    and a2, a1, a4
+; RV32M-NEXT:    srli a1, a1, 2
+; RV32M-NEXT:    and a3, a0, a4
 ; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    add a0, a3, a0
-; RV32M-NEXT:    srli a2, a0, 4
-; RV32M-NEXT:    add a0, a0, a2
+; RV32M-NEXT:    and a1, a1, a4
 ; RV32M-NEXT:    and a0, a0, a4
-; RV32M-NEXT:    mul a0, a0, a5
+; RV32M-NEXT:    add a1, a2, a1
+; RV32M-NEXT:    add a0, a3, a0
+; RV32M-NEXT:    srli a2, a1, 4
+; RV32M-NEXT:    srli a3, a0, 4
+; RV32M-NEXT:    add a1, a1, a2
+; RV32M-NEXT:    add a0, a0, a3
+; RV32M-NEXT:    and a1, a1, a5
+; RV32M-NEXT:    and a0, a0, a5
+; RV32M-NEXT:    mul a1, a1, a6
+; RV32M-NEXT:    mul a0, a0, a6
+; RV32M-NEXT:    srli a1, a1, 24
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    add a0, a0, a1
 ; RV32M-NEXT:    li a1, 0
@@ -2673,32 +2673,32 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ;
 ; RV64M-LABEL: test_ctpop_i64:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    slli a3, a2, 32
-; RV64M-NEXT:    add a2, a2, a3
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    lui a1, 349525
+; RV64M-NEXT:    lui a2, 209715
+; RV64M-NEXT:    lui a3, 61681
+; RV64M-NEXT:    addiw a1, a1, 1365
+; RV64M-NEXT:    addiw a2, a2, 819
+; RV64M-NEXT:    addiw a3, a3, -241
+; RV64M-NEXT:    slli a4, a1, 32
+; RV64M-NEXT:    add a1, a1, a4
+; RV64M-NEXT:    slli a4, a2, 32
+; RV64M-NEXT:    add a2, a2, a4
+; RV64M-NEXT:    slli a4, a3, 32
+; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    srli a4, a0, 1
+; RV64M-NEXT:    and a1, a4, a1
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 4112
+; RV64M-NEXT:    addiw a2, a2, 257
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addiw a1, a1, -241
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addiw a1, a1, 257
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a1, a2, 32
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    add a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
@@ -2720,39 +2720,39 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV32XTHEADBB:       # %bb.0:
 ; RV32XTHEADBB-NEXT:    srli a2, a1, 1
 ; RV32XTHEADBB-NEXT:    lui a3, 349525
+; RV32XTHEADBB-NEXT:    lui a4, 209715
+; RV32XTHEADBB-NEXT:    srli a5, a0, 1
 ; RV32XTHEADBB-NEXT:    addi a3, a3, 1365
 ; RV32XTHEADBB-NEXT:    and a2, a2, a3
-; RV32XTHEADBB-NEXT:    sub a1, a1, a2
-; RV32XTHEADBB-NEXT:    lui a2, 209715
-; RV32XTHEADBB-NEXT:    addi a2, a2, 819
-; RV32XTHEADBB-NEXT:    and a4, a1, a2
-; RV32XTHEADBB-NEXT:    srli a1, a1, 2
-; RV32XTHEADBB-NEXT:    and a1, a1, a2
-; RV32XTHEADBB-NEXT:    add a1, a4, a1
-; RV32XTHEADBB-NEXT:    srli a4, a1, 4
-; RV32XTHEADBB-NEXT:    add a1, a1, a4
-; RV32XTHEADBB-NEXT:    lui a4, 61681
-; RV32XTHEADBB-NEXT:    addi a4, a4, -241
-; RV32XTHEADBB-NEXT:    and a1, a1, a4
-; RV32XTHEADBB-NEXT:    slli a5, a1, 8
-; RV32XTHEADBB-NEXT:    add a1, a1, a5
-; RV32XTHEADBB-NEXT:    slli a5, a1, 16
-; RV32XTHEADBB-NEXT:    add a1, a1, a5
-; RV32XTHEADBB-NEXT:    srli a1, a1, 24
-; RV32XTHEADBB-NEXT:    srli a5, a0, 1
 ; RV32XTHEADBB-NEXT:    and a3, a5, a3
+; RV32XTHEADBB-NEXT:    lui a5, 61681
+; RV32XTHEADBB-NEXT:    addi a4, a4, 819
+; RV32XTHEADBB-NEXT:    addi a5, a5, -241
+; RV32XTHEADBB-NEXT:    sub a1, a1, a2
 ; RV32XTHEADBB-NEXT:    sub a0, a0, a3
-; RV32XTHEADBB-NEXT:    and a3, a0, a2
+; RV32XTHEADBB-NEXT:    and a2, a1, a4
+; RV32XTHEADBB-NEXT:    srli a1, a1, 2
+; RV32XTHEADBB-NEXT:    and a3, a0, a4
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 2
-; RV32XTHEADBB-NEXT:    and a0, a0, a2
-; RV32XTHEADBB-NEXT:    add a0, a3, a0
-; RV32XTHEADBB-NEXT:    srli a2, a0, 4
-; RV32XTHEADBB-NEXT:    add a0, a0, a2
+; RV32XTHEADBB-NEXT:    and a1, a1, a4
 ; RV32XTHEADBB-NEXT:    and a0, a0, a4
-; RV32XTHEADBB-NEXT:    slli a2, a0, 8
-; RV32XTHEADBB-NEXT:    add a0, a0, a2
-; RV32XTHEADBB-NEXT:    slli a2, a0, 16
-; RV32XTHEADBB-NEXT:    add a0, a0, a2
+; RV32XTHEADBB-NEXT:    add a1, a2, a1
+; RV32XTHEADBB-NEXT:    add a0, a3, a0
+; RV32XTHEADBB-NEXT:    srli a2, a1, 4
+; RV32XTHEADBB-NEXT:    srli a3, a0, 4
+; RV32XTHEADBB-NEXT:    add a1, a1, a2
+; RV32XTHEADBB-NEXT:    add a0, a0, a3
+; RV32XTHEADBB-NEXT:    and a1, a1, a5
+; RV32XTHEADBB-NEXT:    and a0, a0, a5
+; RV32XTHEADBB-NEXT:    slli a2, a1, 8
+; RV32XTHEADBB-NEXT:    slli a3, a0, 8
+; RV32XTHEADBB-NEXT:    add a1, a1, a2
+; RV32XTHEADBB-NEXT:    add a0, a0, a3
+; RV32XTHEADBB-NEXT:    slli a2, a1, 16
+; RV32XTHEADBB-NEXT:    slli a3, a0, 16
+; RV32XTHEADBB-NEXT:    add a1, a1, a2
+; RV32XTHEADBB-NEXT:    add a0, a0, a3
+; RV32XTHEADBB-NEXT:    srli a1, a1, 24
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 24
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
 ; RV32XTHEADBB-NEXT:    li a1, 0
@@ -2760,28 +2760,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ;
 ; RV64XTHEADBB-LABEL: test_ctpop_i64:
 ; RV64XTHEADBB:       # %bb.0:
-; RV64XTHEADBB-NEXT:    srli a1, a0, 1
-; RV64XTHEADBB-NEXT:    lui a2, 349525
-; RV64XTHEADBB-NEXT:    addiw a2, a2, 1365
+; RV64XTHEADBB-NEXT:    lui a1, 349525
+; RV64XTHEADBB-NEXT:    lui a2, 209715
+; RV64XTHEADBB-NEXT:    addiw a1, a1, 1365
+; RV64XTHEADBB-NEXT:    addiw a2, a2, 819
+; RV64XTHEADBB-NEXT:    slli a3, a1, 32
+; RV64XTHEADBB-NEXT:    add a1, a1, a3
 ; RV64XTHEADBB-NEXT:    slli a3, a2, 32
 ; RV64XTHEADBB-NEXT:    add a2, a2, a3
-; RV64XTHEADBB-NEXT:    and a1, a1, a2
+; RV64XTHEADBB-NEXT:    srli a3, a0, 1
+; RV64XTHEADBB-NEXT:    and a1, a3, a1
+; RV64XTHEADBB-NEXT:    lui a3, 61681
+; RV64XTHEADBB-NEXT:    addiw a3, a3, -241
 ; RV64XTHEADBB-NEXT:    sub a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 209715
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 819
-; RV64XTHEADBB-NEXT:    slli a2, a1, 32
-; RV64XTHEADBB-NEXT:    add a1, a1, a2
-; RV64XTHEADBB-NEXT:    and a2, a0, a1
+; RV64XTHEADBB-NEXT:    and a1, a0, a2
 ; RV64XTHEADBB-NEXT:    srli a0, a0, 2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    add a0, a2, a0
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    slli a2, a3, 32
+; RV64XTHEADBB-NEXT:    add a0, a1, a0
 ; RV64XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 61681
-; RV64XTHEADBB-NEXT:    addiw a1, a1, -241
-; RV64XTHEADBB-NEXT:    slli a2, a1, 32
-; RV64XTHEADBB-NEXT:    add a1, a1, a2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
+; RV64XTHEADBB-NEXT:    add a2, a3, a2
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
 ; RV64XTHEADBB-NEXT:    slli a1, a0, 8
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    slli a1, a0, 16
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index fe6e20d852d590..03a6a6b1c4b7da 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -603,11 +603,11 @@ define signext i32 @ctlz(i64 %b) nounwind {
 ; RV32I-LABEL: ctlz:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB7_2
 ; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -672,40 +672,40 @@ define signext i32 @ctlz(i64 %b) nounwind {
 ; RV64I-LABEL: ctlz:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 3d9fb91e3adf82..844fa0d1e6ad6e 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -80,25 +80,25 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind {
 ; RV32-LABEL: udiv64_constant_no_add:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 838861
-; RV32-NEXT:    addi a4, a3, -819
-; RV32-NEXT:    mulhu a5, a2, a4
-; RV32-NEXT:    srli a6, a5, 2
-; RV32-NEXT:    andi a5, a5, -4
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    sub a2, a2, a5
-; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    sltu a4, a2, a0
+; RV32-NEXT:    addi a5, a3, -819
 ; RV32-NEXT:    addi a3, a3, -820
-; RV32-NEXT:    mul a3, a5, a3
-; RV32-NEXT:    mulhu a6, a5, a4
-; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    mulhu a4, a2, a5
+; RV32-NEXT:    srli a6, a4, 2
+; RV32-NEXT:    andi a4, a4, -4
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    sub a2, a2, a4
+; RV32-NEXT:    sub a4, a0, a2
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a4, a3
+; RV32-NEXT:    mulhu a3, a4, a5
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    mul a0, a5, a4
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    mul a0, a4, a5
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: udiv64_constant_no_add:
@@ -485,8 +485,8 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant_no_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 24
-; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    li a1, 86
+; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
 ; RV32IM-NEXT:    srli a0, a0, 8
@@ -506,8 +506,8 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant_no_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 56
-; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    li a1, 86
+; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
 ; RV64IM-NEXT:    srli a0, a0, 8
@@ -531,8 +531,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 24
-; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    li a1, 103
+; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
 ; RV32IM-NEXT:    srai a0, a0, 9
@@ -552,8 +552,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 56
-; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    li a1, 103
+; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
 ; RV64IM-NEXT:    srai a0, a0, 9
@@ -577,8 +577,8 @@ define i8 @sdiv8_constant_add_srai(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant_add_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a0, 24
-; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    li a2, -109
+; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    srli a1, a1, 8
 ; RV32IM-NEXT:    add a0, a1, a0
@@ -604,8 +604,8 @@ define i8 @sdiv8_constant_add_srai(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant_add_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 56
-; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    li a2, -109
+; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 8
 ; RV64IM-NEXT:    add a0, a1, a0
@@ -635,8 +635,8 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant_sub_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a0, 24
-; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    li a2, 109
+; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    srli a1, a1, 8
 ; RV32IM-NEXT:    sub a1, a1, a0
@@ -662,8 +662,8 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant_sub_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 56
-; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    li a2, 109
+; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 8
 ; RV64IM-NEXT:    subw a1, a1, a0
@@ -693,8 +693,8 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant_no_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 16
-; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    lui a1, 5
+; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    addi a1, a1, 1366
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
@@ -716,8 +716,8 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant_no_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 48
-; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    lui a1, 5
+; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    addiw a1, a1, 1366
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
@@ -743,8 +743,8 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 16
-; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    lui a1, 6
+; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    addi a1, a1, 1639
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
@@ -766,8 +766,8 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 48
-; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    lui a1, 6
+; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    addiw a1, a1, 1639
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
@@ -793,8 +793,8 @@ define i16 @sdiv16_constant_add_srai(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant_add_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a0, 16
-; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    lui a2, 1048569
+; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    addi a2, a2, -1911
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    srli a1, a1, 16
@@ -822,8 +822,8 @@ define i16 @sdiv16_constant_add_srai(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant_add_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 48
-; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    lui a2, 1048569
+; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    addiw a2, a2, -1911
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 16
@@ -855,8 +855,8 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant_sub_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a0, 16
-; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    lui a2, 7
+; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    addi a2, a2, 1911
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    srli a1, a1, 16
@@ -884,8 +884,8 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant_sub_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 48
-; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    lui a2, 7
+; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    addiw a2, a2, 1911
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 16
diff --git a/llvm/test/CodeGen/RISCV/div-pow2.ll b/llvm/test/CodeGen/RISCV/div-pow2.ll
index 254e675b4ed8b5..6ea5a37ba29635 100644
--- a/llvm/test/CodeGen/RISCV/div-pow2.ll
+++ b/llvm/test/CodeGen/RISCV/div-pow2.ll
@@ -207,14 +207,14 @@ define i64 @sdiv64_pow2_negative_2(i64 %a) {
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    srli a3, a2, 1
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    slli a0, a1, 31
-; RV32I-NEXT:    or a3, a3, a0
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    srai a1, a1, 1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 31
+; RV32I-NEXT:    srai a2, a0, 1
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_2:
@@ -263,14 +263,14 @@ define i64 @sdiv64_pow2_negative_2048(i64 %a) {
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    srli a3, a2, 11
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    slli a0, a1, 21
-; RV32I-NEXT:    or a3, a3, a0
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    srai a1, a1, 11
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 21
+; RV32I-NEXT:    srai a2, a0, 11
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_2048:
@@ -320,14 +320,14 @@ define i64 @sdiv64_pow2_negative_4096(i64 %a) {
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    srli a3, a2, 12
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    slli a0, a1, 20
-; RV32I-NEXT:    or a3, a3, a0
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    srai a1, a1, 12
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 20
+; RV32I-NEXT:    srai a2, a0, 12
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_4096:
@@ -377,14 +377,14 @@ define i64 @sdiv64_pow2_negative_65536(i64 %a) {
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    srli a3, a2, 16
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    or a3, a3, a0
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    srai a1, a1, 16
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a2, a0, 16
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_65536:
diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll
index e94efbea8376d5..bda6ff43a5e7c0 100644
--- a/llvm/test/CodeGen/RISCV/div.ll
+++ b/llvm/test/CodeGen/RISCV/div.ll
@@ -23,8 +23,8 @@ define i32 @udiv(i32 %a, i32 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    call __udivdi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -182,25 +182,25 @@ define i64 @udiv64_constant(i64 %a) nounwind {
 ; RV32IM-LABEL: udiv64_constant:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    add a2, a0, a1
-; RV32IM-NEXT:    sltu a3, a2, a0
-; RV32IM-NEXT:    add a2, a2, a3
 ; RV32IM-NEXT:    lui a3, 838861
-; RV32IM-NEXT:    addi a4, a3, -819
-; RV32IM-NEXT:    mulhu a5, a2, a4
-; RV32IM-NEXT:    srli a6, a5, 2
-; RV32IM-NEXT:    andi a5, a5, -4
-; RV32IM-NEXT:    add a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sub a5, a0, a2
+; RV32IM-NEXT:    sltu a4, a2, a0
+; RV32IM-NEXT:    addi a5, a3, -819
 ; RV32IM-NEXT:    addi a3, a3, -820
-; RV32IM-NEXT:    mul a3, a5, a3
-; RV32IM-NEXT:    mulhu a6, a5, a4
-; RV32IM-NEXT:    add a3, a6, a3
+; RV32IM-NEXT:    add a2, a2, a4
+; RV32IM-NEXT:    mulhu a4, a2, a5
+; RV32IM-NEXT:    srli a6, a4, 2
+; RV32IM-NEXT:    andi a4, a4, -4
+; RV32IM-NEXT:    add a4, a4, a6
+; RV32IM-NEXT:    sub a2, a2, a4
+; RV32IM-NEXT:    sub a4, a0, a2
 ; RV32IM-NEXT:    sltu a0, a0, a2
+; RV32IM-NEXT:    mul a2, a4, a3
+; RV32IM-NEXT:    mulhu a3, a4, a5
 ; RV32IM-NEXT:    sub a1, a1, a0
-; RV32IM-NEXT:    mul a1, a1, a4
-; RV32IM-NEXT:    add a1, a3, a1
-; RV32IM-NEXT:    mul a0, a5, a4
+; RV32IM-NEXT:    add a2, a3, a2
+; RV32IM-NEXT:    mul a1, a1, a5
+; RV32IM-NEXT:    add a1, a2, a1
+; RV32IM-NEXT:    mul a0, a4, a5
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: udiv64_constant:
@@ -919,8 +919,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    call __divsi3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -930,8 +930,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind {
 ; RV32IM-LABEL: sdiv8:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a1, 24
-; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    slli a0, a0, 24
+; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    div a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -941,8 +941,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    call __divdi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -952,8 +952,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind {
 ; RV64IM-LABEL: sdiv8:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a1, 56
-; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    slli a0, a0, 56
+; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    divw a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -977,8 +977,8 @@ define i8 @sdiv8_constant(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 24
-; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    li a1, 103
+; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
 ; RV32IM-NEXT:    srai a0, a0, 9
@@ -1000,8 +1000,8 @@ define i8 @sdiv8_constant(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 56
-; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    li a1, 103
+; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
 ; RV64IM-NEXT:    srai a0, a0, 9
@@ -1105,8 +1105,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    call __divsi3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -1116,8 +1116,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind {
 ; RV32IM-LABEL: sdiv16:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a1, 16
-; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    slli a0, a0, 16
+; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    div a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -1127,8 +1127,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    call __divdi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1138,8 +1138,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind {
 ; RV64IM-LABEL: sdiv16:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a1, 48
-; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    slli a0, a0, 48
+; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    divw a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -1163,8 +1163,8 @@ define i16 @sdiv16_constant(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 16
-; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    lui a1, 6
+; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    addi a1, a1, 1639
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
@@ -1187,8 +1187,8 @@ define i16 @sdiv16_constant(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 48
-; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    lui a1, 6
+; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    addiw a1, a1, 1639
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
diff --git a/llvm/test/CodeGen/RISCV/double-arith.ll b/llvm/test/CodeGen/RISCV/double-arith.ll
index 5f06398daa8b9a..44d7bc590a797b 100644
--- a/llvm/test/CodeGen/RISCV/double-arith.ll
+++ b/llvm/test/CodeGen/RISCV/double-arith.ll
@@ -225,8 +225,8 @@ define double @fsgnj_d(double %a, double %b) nounwind {
 ; RV32I-LABEL: fsgnj_d:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -234,8 +234,8 @@ define double @fsgnj_d(double %a, double %b) nounwind {
 ; RV64I-LABEL: fsgnj_d:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a1, 63
-; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -327,8 +327,8 @@ define double @fsgnjn_d(double %a, double %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a2, a3
 ; RV32I-NEXT:    lui a3, 524288
-; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -1524,8 +1524,8 @@ define double @fsgnjx_f64(double %x, double %y) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srli a0, a0, 63
-; RV64I-NEXT:    slli a0, a0, 63
 ; RV64I-NEXT:    li a2, 1023
+; RV64I-NEXT:    slli a0, a0, 63
 ; RV64I-NEXT:    slli a2, a2, 52
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    call __muldf3
diff --git a/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
index 01aa25c15c8d2b..14193bf4cb169e 100644
--- a/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
+++ b/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
@@ -112,8 +112,8 @@ define double @fcopysign_fneg(double %a, double %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a2, a3
 ; RV32I-NEXT:    lui a3, 524288
-; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/double-calling-conv.ll b/llvm/test/CodeGen/RISCV/double-calling-conv.ll
index b9e80dccd97b9a..798eac64e9fc26 100644
--- a/llvm/test/CodeGen/RISCV/double-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/double-calling-conv.ll
@@ -42,11 +42,11 @@ define double @caller_double_inreg() nounwind {
 ; RV32IFD-NEXT:    addi sp, sp, -16
 ; RV32IFD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    lui a0, 262236
+; RV32IFD-NEXT:    lui a2, 377487
+; RV32IFD-NEXT:    lui a3, 262364
 ; RV32IFD-NEXT:    addi a1, a0, 655
-; RV32IFD-NEXT:    lui a0, 377487
-; RV32IFD-NEXT:    addi a0, a0, 1475
-; RV32IFD-NEXT:    lui a2, 262364
-; RV32IFD-NEXT:    addi a3, a2, 655
+; RV32IFD-NEXT:    addi a0, a2, 1475
+; RV32IFD-NEXT:    addi a3, a3, 655
 ; RV32IFD-NEXT:    mv a2, a0
 ; RV32IFD-NEXT:    call callee_double_inreg
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -58,11 +58,11 @@ define double @caller_double_inreg() nounwind {
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, -16
 ; RV32IZFINXZDINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFINXZDINX-NEXT:    lui a0, 262236
+; RV32IZFINXZDINX-NEXT:    lui a2, 377487
+; RV32IZFINXZDINX-NEXT:    lui a3, 262364
 ; RV32IZFINXZDINX-NEXT:    addi a1, a0, 655
-; RV32IZFINXZDINX-NEXT:    lui a0, 377487
-; RV32IZFINXZDINX-NEXT:    addi a0, a0, 1475
-; RV32IZFINXZDINX-NEXT:    lui a2, 262364
-; RV32IZFINXZDINX-NEXT:    addi a3, a2, 655
+; RV32IZFINXZDINX-NEXT:    addi a0, a2, 1475
+; RV32IZFINXZDINX-NEXT:    addi a3, a3, 655
 ; RV32IZFINXZDINX-NEXT:    mv a2, a0
 ; RV32IZFINXZDINX-NEXT:    call callee_double_inreg
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -107,14 +107,14 @@ define double @caller_double_split_reg_stack() nounwind {
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
 ; RV32IFD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    lui a0, 262510
-; RV32IFD-NEXT:    addi a2, a0, 327
-; RV32IFD-NEXT:    lui a0, 262446
-; RV32IFD-NEXT:    addi a6, a0, 327
-; RV32IFD-NEXT:    lui a0, 713032
-; RV32IFD-NEXT:    addi a5, a0, -1311
+; RV32IFD-NEXT:    lui a2, 262510
+; RV32IFD-NEXT:    lui a3, 262446
+; RV32IFD-NEXT:    lui a4, 713032
 ; RV32IFD-NEXT:    li a0, 1
 ; RV32IFD-NEXT:    li a1, 2
+; RV32IFD-NEXT:    addi a2, a2, 327
+; RV32IFD-NEXT:    addi a6, a3, 327
+; RV32IFD-NEXT:    addi a5, a4, -1311
 ; RV32IFD-NEXT:    li a3, 3
 ; RV32IFD-NEXT:    sw a2, 0(sp)
 ; RV32IFD-NEXT:    li a2, 0
@@ -129,14 +129,14 @@ define double @caller_double_split_reg_stack() nounwind {
 ; RV32IZFINXZDINX:       # %bb.0:
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, -16
 ; RV32IZFINXZDINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    lui a0, 262510
-; RV32IZFINXZDINX-NEXT:    addi a2, a0, 327
-; RV32IZFINXZDINX-NEXT:    lui a0, 262446
-; RV32IZFINXZDINX-NEXT:    addi a6, a0, 327
-; RV32IZFINXZDINX-NEXT:    lui a0, 713032
-; RV32IZFINXZDINX-NEXT:    addi a5, a0, -1311
+; RV32IZFINXZDINX-NEXT:    lui a2, 262510
+; RV32IZFINXZDINX-NEXT:    lui a3, 262446
+; RV32IZFINXZDINX-NEXT:    lui a4, 713032
 ; RV32IZFINXZDINX-NEXT:    li a0, 1
 ; RV32IZFINXZDINX-NEXT:    li a1, 2
+; RV32IZFINXZDINX-NEXT:    addi a2, a2, 327
+; RV32IZFINXZDINX-NEXT:    addi a6, a3, 327
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1311
 ; RV32IZFINXZDINX-NEXT:    li a3, 3
 ; RV32IZFINXZDINX-NEXT:    sw a2, 0(sp)
 ; RV32IZFINXZDINX-NEXT:    li a2, 0
@@ -180,16 +180,16 @@ define double @caller_double_stack() nounwind {
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -32
 ; RV32IFD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    lui a0, 262510
-; RV32IFD-NEXT:    addi a1, a0, 327
-; RV32IFD-NEXT:    lui a0, 713032
-; RV32IFD-NEXT:    addi a3, a0, -1311
-; RV32IFD-NEXT:    lui a0, 262574
-; RV32IFD-NEXT:    addi a5, a0, 327
+; RV32IFD-NEXT:    lui a1, 262510
+; RV32IFD-NEXT:    lui a3, 713032
+; RV32IFD-NEXT:    lui a5, 262574
 ; RV32IFD-NEXT:    li a0, 1
 ; RV32IFD-NEXT:    li a2, 2
 ; RV32IFD-NEXT:    li a4, 3
 ; RV32IFD-NEXT:    li a6, 4
+; RV32IFD-NEXT:    addi a1, a1, 327
+; RV32IFD-NEXT:    addi a3, a3, -1311
+; RV32IFD-NEXT:    addi a5, a5, 327
 ; RV32IFD-NEXT:    sw a3, 0(sp)
 ; RV32IFD-NEXT:    sw a1, 4(sp)
 ; RV32IFD-NEXT:    sw a3, 8(sp)
@@ -207,16 +207,16 @@ define double @caller_double_stack() nounwind {
 ; RV32IZFINXZDINX:       # %bb.0:
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, -32
 ; RV32IZFINXZDINX-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    lui a0, 262510
-; RV32IZFINXZDINX-NEXT:    addi a1, a0, 327
-; RV32IZFINXZDINX-NEXT:    lui a0, 713032
-; RV32IZFINXZDINX-NEXT:    addi a3, a0, -1311
-; RV32IZFINXZDINX-NEXT:    lui a0, 262574
-; RV32IZFINXZDINX-NEXT:    addi a5, a0, 327
+; RV32IZFINXZDINX-NEXT:    lui a1, 262510
+; RV32IZFINXZDINX-NEXT:    lui a3, 713032
+; RV32IZFINXZDINX-NEXT:    lui a5, 262574
 ; RV32IZFINXZDINX-NEXT:    li a0, 1
 ; RV32IZFINXZDINX-NEXT:    li a2, 2
 ; RV32IZFINXZDINX-NEXT:    li a4, 3
 ; RV32IZFINXZDINX-NEXT:    li a6, 4
+; RV32IZFINXZDINX-NEXT:    addi a1, a1, 327
+; RV32IZFINXZDINX-NEXT:    addi a3, a3, -1311
+; RV32IZFINXZDINX-NEXT:    addi a5, a5, 327
 ; RV32IZFINXZDINX-NEXT:    sw a3, 0(sp)
 ; RV32IZFINXZDINX-NEXT:    sw a1, 4(sp)
 ; RV32IZFINXZDINX-NEXT:    sw a3, 8(sp)
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index a72055ab2baa3d..c39085a80ddc10 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -692,7 +692,7 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB12_2
 ; RV32IFD-NEXT:  # %bb.1: # %start
@@ -700,19 +700,19 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IFD-NEXT:  .LBB12_2: # %start
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI12_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI12_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB12_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB12_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB12_4: # %start
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -740,29 +740,29 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI12_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI12_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI12_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
-; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    beqz a2, .LBB12_2
+; RV32IZFINXZDINX-NEXT:    fle.d a3, a2, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
+; RV32IZFINXZDINX-NEXT:    lui a2, 524288
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB12_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1: # %start
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB12_2: # %start
 ; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI12_1)
 ; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI12_1)(a1)
 ; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI12_1+4)(a1)
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB12_4
+; RV32IZFINXZDINX-NEXT:    flt.d a1, a6, s0
+; RV32IZFINXZDINX-NEXT:    beqz a1, .LBB12_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a2, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB12_4: # %start
-; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
+; RV32IZFINXZDINX-NEXT:    feq.d a4, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a3, a3
 ; RV32IZFINXZDINX-NEXT:    neg a5, a1
-; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a4, a4
+; RV32IZFINXZDINX-NEXT:    and a0, a3, a0
+; RV32IZFINXZDINX-NEXT:    and a1, a4, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    and a0, a4, a0
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -800,14 +800,14 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    li a2, 0
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    call __fixdfdi
-; RV32I-NEXT:    mv s4, a0
+; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    mv s5, a1
 ; RV32I-NEXT:    lui a0, 524288
-; RV32I-NEXT:    bgez s3, .LBB12_2
+; RV32I-NEXT:    bgez s4, .LBB12_2
 ; RV32I-NEXT:  # %bb.1: # %start
 ; RV32I-NEXT:    lui s5, 524288
 ; RV32I-NEXT:  .LBB12_2: # %start
@@ -821,14 +821,14 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    call __unorddf2
 ; RV32I-NEXT:    snez a0, a0
+; RV32I-NEXT:    slti a1, s4, 0
+; RV32I-NEXT:    sgtz a2, s2
 ; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    addi a3, a1, -1
 ; RV32I-NEXT:    and a1, a0, s5
-; RV32I-NEXT:    slti a2, s3, 0
-; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a2, a2, s4
-; RV32I-NEXT:    sgtz a3, s2
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    or a2, a3, a2
+; RV32I-NEXT:    and a3, a3, s3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    or a2, a2, a3
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -954,10 +954,10 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV32IFD-NEXT:    lui a2, %hi(.LCPI14_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI14_0)(a2)
 ; RV32IFD-NEXT:    and a0, s0, a0
+; RV32IFD-NEXT:    and a1, s0, a1
 ; RV32IFD-NEXT:    flt.d a2, fa5, fs0
 ; RV32IFD-NEXT:    neg a2, a2
 ; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a1, s0, a1
 ; RV32IFD-NEXT:    or a1, a2, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -985,16 +985,16 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI14_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI14_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI14_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1037,8 +1037,8 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    call __fixunsdfdi
 ; RV32I-NEXT:    and a0, s3, a0
-; RV32I-NEXT:    or a0, s2, a0
 ; RV32I-NEXT:    and a1, s3, a1
+; RV32I-NEXT:    or a0, s2, a0
 ; RV32I-NEXT:    or a1, s2, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -1669,8 +1669,8 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind {
 ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZFINXZDINX:       # %bb.0: # %start
 ; RV64IZFINXZDINX-NEXT:    li a1, -505
-; RV64IZFINXZDINX-NEXT:    slli a1, a1, 53
 ; RV64IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI26_0)
+; RV64IZFINXZDINX-NEXT:    slli a1, a1, 53
 ; RV64IZFINXZDINX-NEXT:    ld a2, %lo(.LCPI26_0)(a2)
 ; RV64IZFINXZDINX-NEXT:    fmax.d a1, a0, a1
 ; RV64IZFINXZDINX-NEXT:    feq.d a0, a0, a0
@@ -2044,11 +2044,11 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind {
 ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZFINXZDINX:       # %bb.0: # %start
 ; RV64IZFINXZDINX-NEXT:    feq.d a1, a0, a0
-; RV64IZFINXZDINX-NEXT:    neg a1, a1
 ; RV64IZFINXZDINX-NEXT:    li a2, -509
 ; RV64IZFINXZDINX-NEXT:    slli a2, a2, 53
 ; RV64IZFINXZDINX-NEXT:    fmax.d a0, a0, a2
 ; RV64IZFINXZDINX-NEXT:    lui a2, 65919
+; RV64IZFINXZDINX-NEXT:    neg a1, a1
 ; RV64IZFINXZDINX-NEXT:    slli a2, a2, 34
 ; RV64IZFINXZDINX-NEXT:    fmin.d a0, a0, a2
 ; RV64IZFINXZDINX-NEXT:    fcvt.l.d a0, a0, rtz
diff --git a/llvm/test/CodeGen/RISCV/double-imm.ll b/llvm/test/CodeGen/RISCV/double-imm.ll
index 2294171d95ab2c..155827ad069cc6 100644
--- a/llvm/test/CodeGen/RISCV/double-imm.ll
+++ b/llvm/test/CodeGen/RISCV/double-imm.ll
@@ -24,8 +24,8 @@ define double @double_imm() nounwind {
 ; CHECKRV32ZDINX-LABEL: double_imm:
 ; CHECKRV32ZDINX:       # %bb.0:
 ; CHECKRV32ZDINX-NEXT:    lui a0, 345155
-; CHECKRV32ZDINX-NEXT:    addi a0, a0, -744
 ; CHECKRV32ZDINX-NEXT:    lui a1, 262290
+; CHECKRV32ZDINX-NEXT:    addi a0, a0, -744
 ; CHECKRV32ZDINX-NEXT:    addi a1, a1, 507
 ; CHECKRV32ZDINX-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/double-intrinsics.ll b/llvm/test/CodeGen/RISCV/double-intrinsics.ll
index a65fd09613424c..3ef128ed6d4cdb 100644
--- a/llvm/test/CodeGen/RISCV/double-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/double-intrinsics.ll
@@ -817,8 +817,8 @@ define double @copysign_f64(double %a, double %b) nounwind {
 ; RV32I-LABEL: copysign_f64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -826,8 +826,8 @@ define double @copysign_f64(double %a, double %b) nounwind {
 ; RV64I-LABEL: copysign_f64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a1, 63
-; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -1535,8 +1535,8 @@ define i1 @isnan_d_fpclass(double %x) {
 ; RV64I-LABEL: isnan_d_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 1
-; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    li a1, 2047
+; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    slli a1, a1, 52
 ; RV64I-NEXT:    slt a0, a1, a0
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/double-previous-failure.ll b/llvm/test/CodeGen/RISCV/double-previous-failure.ll
index c169b1099b273a..c5a7ee79364c65 100644
--- a/llvm/test/CodeGen/RISCV/double-previous-failure.ll
+++ b/llvm/test/CodeGen/RISCV/double-previous-failure.ll
@@ -28,8 +28,8 @@ define i32 @main() nounwind {
 ; RV32IFD-NEXT:    call test
 ; RV32IFD-NEXT:    sw a0, 0(sp)
 ; RV32IFD-NEXT:    sw a1, 4(sp)
-; RV32IFD-NEXT:    fld fa5, 0(sp)
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV32IFD-NEXT:    fld fa5, 0(sp)
 ; RV32IFD-NEXT:    fld fa4, %lo(.LCPI1_0)(a0)
 ; RV32IFD-NEXT:    flt.d a0, fa5, fa4
 ; RV32IFD-NEXT:    bnez a0, .LBB1_3
diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index 0839f61b2d7936..cd87f2d2301d7c 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -53,7 +53,7 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB1_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -61,19 +61,19 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB1_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI1_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI1_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB1_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB1_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB1_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -100,31 +100,31 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI1_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI1_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI1_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI1_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI1_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI1_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI1_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI1_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI1_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI1_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI1_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB1_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB1_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB1_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB1_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB1_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -186,15 +186,15 @@ define i64 @test_floor_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call floor
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI3_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI3_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -223,16 +223,16 @@ define i64 @test_floor_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI3_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI3_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI3_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -297,7 +297,7 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB5_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -305,19 +305,19 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB5_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI5_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI5_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB5_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB5_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB5_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -344,31 +344,31 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI5_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI5_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI5_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI5_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI5_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI5_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI5_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI5_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI5_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI5_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI5_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB5_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB5_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB5_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB5_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB5_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -430,15 +430,15 @@ define i64 @test_ceil_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call ceil
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI7_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI7_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -467,16 +467,16 @@ define i64 @test_ceil_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI7_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI7_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI7_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -541,7 +541,7 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB9_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -549,19 +549,19 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB9_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI9_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI9_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB9_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB9_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB9_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -588,31 +588,31 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI9_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI9_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI9_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI9_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI9_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI9_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI9_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI9_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI9_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI9_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI9_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB9_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB9_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB9_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB9_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB9_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -674,15 +674,15 @@ define i64 @test_trunc_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call trunc
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI11_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -711,16 +711,16 @@ define i64 @test_trunc_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI11_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI11_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI11_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -785,7 +785,7 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB13_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -793,19 +793,19 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB13_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI13_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI13_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB13_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB13_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB13_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -832,31 +832,31 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI13_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI13_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI13_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI13_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI13_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI13_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI13_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI13_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI13_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI13_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI13_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB13_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB13_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB13_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB13_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB13_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -918,15 +918,15 @@ define i64 @test_round_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call round
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI15_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI15_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -955,16 +955,16 @@ define i64 @test_round_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI15_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI15_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI15_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1029,7 +1029,7 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB17_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -1037,19 +1037,19 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB17_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI17_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI17_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB17_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB17_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB17_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1076,31 +1076,31 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI17_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI17_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI17_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI17_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI17_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI17_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI17_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI17_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI17_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI17_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI17_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB17_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB17_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB17_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB17_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB17_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1162,15 +1162,15 @@ define i64 @test_roundeven_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call roundeven
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI19_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1199,16 +1199,16 @@ define i64 @test_roundeven_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI19_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI19_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI19_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1273,7 +1273,7 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB21_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -1281,19 +1281,19 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB21_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI21_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI21_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB21_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB21_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB21_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1320,31 +1320,31 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI21_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI21_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI21_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI21_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI21_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI21_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI21_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI21_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI21_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI21_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI21_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB21_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB21_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB21_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB21_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB21_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1406,15 +1406,15 @@ define i64 @test_rint_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call rint
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI23_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI23_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1443,16 +1443,16 @@ define i64 @test_rint_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI23_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI23_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI23_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/double_reduct.ll b/llvm/test/CodeGen/RISCV/double_reduct.ll
index 25228b21ef0554..cecdd77a079e42 100644
--- a/llvm/test/CodeGen/RISCV/double_reduct.ll
+++ b/llvm/test/CodeGen/RISCV/double_reduct.ll
@@ -25,14 +25,14 @@ define float @fmul_f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 2
 ; CHECK-NEXT:    vfmul.vv v8, v8, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 2
+; CHECK-NEXT:    vfmul.vv v9, v9, v10
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
 ; CHECK-NEXT:    vfmul.vv v8, v8, v10
+; CHECK-NEXT:    vrgather.vi v10, v9, 1
+; CHECK-NEXT:    vfmul.vv v9, v9, v10
 ; CHECK-NEXT:    vfmv.f.s fa5, v8
-; CHECK-NEXT:    vslidedown.vi v8, v9, 2
-; CHECK-NEXT:    vfmul.vv v8, v9, v8
-; CHECK-NEXT:    vrgather.vi v9, v8, 1
-; CHECK-NEXT:    vfmul.vv v8, v8, v9
-; CHECK-NEXT:    vfmv.f.s fa4, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
 ; CHECK-NEXT:    fmul.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -130,14 +130,14 @@ define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    vslidedown.vi v10, v9, 2
+; RV32-NEXT:    vmul.vv v9, v9, v10
 ; RV32-NEXT:    vrgather.vi v10, v8, 1
 ; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    vrgather.vi v10, v9, 1
+; RV32-NEXT:    vmul.vv v9, v9, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v8, v9, 2
-; RV32-NEXT:    vmul.vv v8, v9, v8
-; RV32-NEXT:    vrgather.vi v9, v8, 1
-; RV32-NEXT:    vmul.vv v8, v8, v9
-; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    vmv.x.s a1, v9
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    ret
 ;
@@ -146,14 +146,14 @@ define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) {
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 2
 ; RV64-NEXT:    vmul.vv v8, v8, v10
+; RV64-NEXT:    vslidedown.vi v10, v9, 2
+; RV64-NEXT:    vmul.vv v9, v9, v10
 ; RV64-NEXT:    vrgather.vi v10, v8, 1
 ; RV64-NEXT:    vmul.vv v8, v8, v10
+; RV64-NEXT:    vrgather.vi v10, v9, 1
+; RV64-NEXT:    vmul.vv v9, v9, v10
 ; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v8, v9, 2
-; RV64-NEXT:    vmul.vv v8, v9, v8
-; RV64-NEXT:    vrgather.vi v9, v8, 1
-; RV64-NEXT:    vmul.vv v8, v8, v9
-; RV64-NEXT:    vmv.x.s a1, v8
+; RV64-NEXT:    vmv.x.s a1, v9
 ; RV64-NEXT:    mulw a0, a0, a1
 ; RV64-NEXT:    ret
   %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
index cfde765873386d..5a38ec36068f93 100644
--- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
@@ -49,10 +49,10 @@ define void @_Z3foov() {
 ; CHECK-NEXT:    vs2r.v v14, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_40)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_40)
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a0, 1048572
diff --git a/llvm/test/CodeGen/RISCV/float-arith.ll b/llvm/test/CodeGen/RISCV/float-arith.ll
index bf500d1a2adb39..57b3423da69a6b 100644
--- a/llvm/test/CodeGen/RISCV/float-arith.ll
+++ b/llvm/test/CodeGen/RISCV/float-arith.ll
@@ -195,8 +195,8 @@ define float @fsgnj_s(float %a, float %b) nounwind {
 ; RV32I-LABEL: fsgnj_s:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -204,8 +204,8 @@ define float @fsgnj_s(float %a, float %b) nounwind {
 ; RV64I-LABEL: fsgnj_s:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 33
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -284,8 +284,8 @@ define float @fsgnjn_s(float %a, float %b) nounwind {
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    not a0, a0
 ; RV32I-NEXT:    lui a1, 524288
-; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli s0, s0, 1
+; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    srli s0, s0, 1
 ; RV32I-NEXT:    or a0, s0, a0
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -302,8 +302,8 @@ define float @fsgnjn_s(float %a, float %b) nounwind {
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    not a0, a0
 ; RV64I-NEXT:    lui a1, 524288
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli s0, s0, 33
+; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    srli s0, s0, 33
 ; RV64I-NEXT:    or a0, s0, a0
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
index 86f6f079243c26..aaeb1b7c0b1fb1 100644
--- a/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
+++ b/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
@@ -107,8 +107,8 @@ define float @fcopysign_fneg(float %a, float %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a1, a1
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -131,8 +131,8 @@ define float @fcopysign_fneg(float %a, float %b) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    not a1, a1
 ; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 33
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 5e73f862bca57c..fc866d71a3a709 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -629,7 +629,7 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fle.s s0, fa5, fa0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB12_2
 ; RV32IF-NEXT:  # %bb.1: # %start
@@ -637,19 +637,19 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32IF-NEXT:  .LBB12_2: # %start
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI12_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI12_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB12_4
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB12_4
 ; RV32IF-NEXT:  # %bb.3:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB12_4: # %start
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -676,7 +676,7 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB12_2
 ; RV32IZFINX-NEXT:  # %bb.1: # %start
@@ -684,19 +684,19 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32IZFINX-NEXT:  .LBB12_2: # %start
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB12_4
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB12_4
 ; RV32IZFINX-NEXT:  # %bb.3:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB12_4: # %start
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -722,40 +722,40 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    lui a1, 913408
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    call __fixsfdi
 ; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    call __fixsfdi
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui s5, 524288
-; RV32I-NEXT:    bgez s1, .LBB12_2
+; RV32I-NEXT:    bgez s2, .LBB12_2
 ; RV32I-NEXT:  # %bb.1: # %start
 ; RV32I-NEXT:    lui s3, 524288
 ; RV32I-NEXT:  .LBB12_2: # %start
 ; RV32I-NEXT:    lui a1, 389120
 ; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __gtsf2
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    blez a0, .LBB12_4
 ; RV32I-NEXT:  # %bb.3: # %start
 ; RV32I-NEXT:    addi s3, s5, -1
 ; RV32I-NEXT:  .LBB12_4: # %start
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    call __unordsf2
 ; RV32I-NEXT:    snez a0, a0
+; RV32I-NEXT:    slti a1, s2, 0
+; RV32I-NEXT:    sgtz a2, s4
 ; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    addi a3, a1, -1
 ; RV32I-NEXT:    and a1, a0, s3
-; RV32I-NEXT:    slti a2, s1, 0
-; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a2, a2, s2
-; RV32I-NEXT:    sgtz a3, s4
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    or a2, a3, a2
+; RV32I-NEXT:    and a3, a3, s0
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    or a2, a2, a3
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -879,10 +879,10 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI14_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI14_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -912,11 +912,11 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -955,10 +955,10 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __gtsf2
 ; RV32I-NEXT:    sgtz a0, a0
-; RV32I-NEXT:    neg a1, a0
-; RV32I-NEXT:    or a0, a1, s3
-; RV32I-NEXT:    and a2, s2, s1
-; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    and a1, s2, s1
+; RV32I-NEXT:    neg a2, a0
+; RV32I-NEXT:    or a0, a2, s3
+; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1445,10 +1445,10 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind {
 ; RV32IZFINX-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZFINX:       # %bb.0: # %start
 ; RV32IZFINX-NEXT:    feq.s a1, a0, a0
-; RV32IZFINX-NEXT:    neg a1, a1
 ; RV32IZFINX-NEXT:    lui a2, 815104
 ; RV32IZFINX-NEXT:    fmax.s a0, a0, a2
 ; RV32IZFINX-NEXT:    lui a2, 290816
+; RV32IZFINX-NEXT:    neg a1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -512
 ; RV32IZFINX-NEXT:    fmin.s a0, a0, a2
 ; RV32IZFINX-NEXT:    fcvt.w.s a0, a0, rtz
@@ -1458,10 +1458,10 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind {
 ; RV64IZFINX-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZFINX:       # %bb.0: # %start
 ; RV64IZFINX-NEXT:    feq.s a1, a0, a0
-; RV64IZFINX-NEXT:    neg a1, a1
 ; RV64IZFINX-NEXT:    lui a2, 815104
 ; RV64IZFINX-NEXT:    fmax.s a0, a0, a2
 ; RV64IZFINX-NEXT:    lui a2, 290816
+; RV64IZFINX-NEXT:    neg a1, a1
 ; RV64IZFINX-NEXT:    addiw a2, a2, -512
 ; RV64IZFINX-NEXT:    fmin.s a0, a0, a2
 ; RV64IZFINX-NEXT:    fcvt.l.s a0, a0, rtz
@@ -1763,11 +1763,11 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind {
 ; RV32IF-LABEL: fcvt_w_s_sat_i8:
 ; RV32IF:       # %bb.0: # %start
 ; RV32IF-NEXT:    feq.s a0, fa0, fa0
-; RV32IF-NEXT:    neg a0, a0
 ; RV32IF-NEXT:    lui a1, 798720
 ; RV32IF-NEXT:    fmv.w.x fa5, a1
-; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32IF-NEXT:    lui a1, 274400
+; RV32IF-NEXT:    neg a0, a0
+; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32IF-NEXT:    fmv.w.x fa4, a1
 ; RV32IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IF-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -1777,11 +1777,11 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind {
 ; RV64IF-LABEL: fcvt_w_s_sat_i8:
 ; RV64IF:       # %bb.0: # %start
 ; RV64IF-NEXT:    feq.s a0, fa0, fa0
-; RV64IF-NEXT:    neg a0, a0
 ; RV64IF-NEXT:    lui a1, 798720
 ; RV64IF-NEXT:    fmv.w.x fa5, a1
-; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64IF-NEXT:    lui a1, 274400
+; RV64IF-NEXT:    neg a0, a0
+; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64IF-NEXT:    fmv.w.x fa4, a1
 ; RV64IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IF-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -1791,8 +1791,8 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind {
 ; RV32IZFINX-LABEL: fcvt_w_s_sat_i8:
 ; RV32IZFINX:       # %bb.0: # %start
 ; RV32IZFINX-NEXT:    feq.s a1, a0, a0
-; RV32IZFINX-NEXT:    neg a1, a1
 ; RV32IZFINX-NEXT:    lui a2, 798720
+; RV32IZFINX-NEXT:    neg a1, a1
 ; RV32IZFINX-NEXT:    fmax.s a0, a0, a2
 ; RV32IZFINX-NEXT:    lui a2, 274400
 ; RV32IZFINX-NEXT:    fmin.s a0, a0, a2
@@ -1803,8 +1803,8 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind {
 ; RV64IZFINX-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZFINX:       # %bb.0: # %start
 ; RV64IZFINX-NEXT:    feq.s a1, a0, a0
-; RV64IZFINX-NEXT:    neg a1, a1
 ; RV64IZFINX-NEXT:    lui a2, 798720
+; RV64IZFINX-NEXT:    neg a1, a1
 ; RV64IZFINX-NEXT:    fmax.s a0, a0, a2
 ; RV64IZFINX-NEXT:    lui a2, 274400
 ; RV64IZFINX-NEXT:    fmin.s a0, a0, a2
@@ -1943,8 +1943,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(float %a) nounwind {
 ; RV32IF-LABEL: fcvt_wu_s_sat_i8:
 ; RV32IF:       # %bb.0: # %start
 ; RV32IF-NEXT:    fmv.w.x fa5, zero
-; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32IF-NEXT:    lui a0, 276464
+; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32IF-NEXT:    fmv.w.x fa4, a0
 ; RV32IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IF-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -1953,8 +1953,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(float %a) nounwind {
 ; RV64IF-LABEL: fcvt_wu_s_sat_i8:
 ; RV64IF:       # %bb.0: # %start
 ; RV64IF-NEXT:    fmv.w.x fa5, zero
-; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64IF-NEXT:    lui a0, 276464
+; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64IF-NEXT:    fmv.w.x fa4, a0
 ; RV64IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IF-NEXT:    fcvt.lu.s a0, fa5, rtz
diff --git a/llvm/test/CodeGen/RISCV/float-intrinsics.ll b/llvm/test/CodeGen/RISCV/float-intrinsics.ll
index e154f3361a1216..37381aeeb2a0f3 100644
--- a/llvm/test/CodeGen/RISCV/float-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/float-intrinsics.ll
@@ -807,8 +807,8 @@ define float @copysign_f32(float %a, float %b) nounwind {
 ; RV32I-LABEL: copysign_f32:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -816,8 +816,8 @@ define float @copysign_f32(float %a, float %b) nounwind {
 ; RV64I-LABEL: copysign_f32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 33
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -1603,54 +1603,54 @@ define i1 @fpclass(float %x) {
 ; RV32I-LABEL: fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 1
-; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    addi a2, a1, -1
-; RV32I-NEXT:    lui a3, 2048
-; RV32I-NEXT:    addi a3, a3, -1
-; RV32I-NEXT:    sltu a2, a2, a3
+; RV32I-NEXT:    lui a2, 2048
 ; RV32I-NEXT:    slti a0, a0, 0
-; RV32I-NEXT:    and a2, a2, a0
-; RV32I-NEXT:    seqz a3, a1
-; RV32I-NEXT:    lui a4, 522240
-; RV32I-NEXT:    xor a5, a1, a4
+; RV32I-NEXT:    lui a3, 522240
+; RV32I-NEXT:    lui a4, 1046528
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    addi a2, a2, -1
+; RV32I-NEXT:    addi a5, a1, -1
+; RV32I-NEXT:    sltu a2, a5, a2
+; RV32I-NEXT:    xor a5, a1, a3
+; RV32I-NEXT:    slt a3, a3, a1
+; RV32I-NEXT:    add a4, a1, a4
+; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    seqz a5, a5
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    or a2, a3, a2
-; RV32I-NEXT:    slt a3, a4, a1
-; RV32I-NEXT:    or a2, a2, a3
-; RV32I-NEXT:    lui a3, 1046528
-; RV32I-NEXT:    add a1, a1, a3
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sltiu a1, a1, 127
-; RV32I-NEXT:    and a0, a1, a0
-; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    and a2, a2, a0
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    sltiu a4, a4, 127
+; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    and a0, a4, a0
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sext.w a1, a0
 ; RV64I-NEXT:    slli a0, a0, 33
+; RV64I-NEXT:    lui a2, 2048
+; RV64I-NEXT:    lui a3, 522240
+; RV64I-NEXT:    lui a4, 1046528
 ; RV64I-NEXT:    srli a0, a0, 33
-; RV64I-NEXT:    addi a2, a0, -1
-; RV64I-NEXT:    lui a3, 2048
-; RV64I-NEXT:    addiw a3, a3, -1
-; RV64I-NEXT:    sltu a2, a2, a3
+; RV64I-NEXT:    addiw a2, a2, -1
 ; RV64I-NEXT:    slti a1, a1, 0
-; RV64I-NEXT:    and a2, a2, a1
-; RV64I-NEXT:    seqz a3, a0
-; RV64I-NEXT:    lui a4, 522240
-; RV64I-NEXT:    xor a5, a0, a4
+; RV64I-NEXT:    addi a5, a0, -1
+; RV64I-NEXT:    sltu a2, a5, a2
+; RV64I-NEXT:    xor a5, a0, a3
+; RV64I-NEXT:    slt a3, a3, a0
+; RV64I-NEXT:    add a4, a0, a4
+; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    seqz a5, a5
-; RV64I-NEXT:    or a3, a3, a5
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    slt a3, a4, a0
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    lui a3, 1046528
-; RV64I-NEXT:    add a0, a0, a3
-; RV64I-NEXT:    srliw a0, a0, 24
-; RV64I-NEXT:    sltiu a0, a0, 127
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    srliw a4, a4, 24
+; RV64I-NEXT:    and a2, a2, a1
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    sltiu a4, a4, 127
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
   %cmp = call i1 @llvm.is.fpclass.f32(float %x, i32 639)
   ret i1 %cmp
@@ -1732,8 +1732,8 @@ define i1 @isqnan_fpclass(float %x) {
 ; RV32I-LABEL: isqnan_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    lui a1, 523264
+; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    slt a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -1741,8 +1741,8 @@ define i1 @isqnan_fpclass(float %x) {
 ; RV64I-LABEL: isqnan_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    lui a1, 523264
+; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    addiw a1, a1, -1
 ; RV64I-NEXT:    slt a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1782,10 +1782,10 @@ define i1 @issnan_fpclass(float %x) {
 ; RV32I-LABEL: issnan_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    lui a1, 523264
-; RV32I-NEXT:    slt a1, a0, a1
 ; RV32I-NEXT:    lui a2, 522240
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    slt a1, a0, a1
 ; RV32I-NEXT:    slt a0, a2, a0
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -1793,10 +1793,10 @@ define i1 @issnan_fpclass(float %x) {
 ; RV64I-LABEL: issnan_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    lui a1, 523264
-; RV64I-NEXT:    slt a1, a0, a1
 ; RV64I-NEXT:    lui a2, 522240
+; RV64I-NEXT:    srli a0, a0, 33
+; RV64I-NEXT:    slt a1, a0, a1
 ; RV64I-NEXT:    slt a0, a2, a0
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -2068,8 +2068,8 @@ define i1 @isnegfinite_fpclass(float %x) {
 ; RV32I-LABEL: isnegfinite_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 1
-; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    lui a2, 522240
+; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    slt a1, a1, a2
 ; RV32I-NEXT:    slti a0, a0, 0
 ; RV32I-NEXT:    and a0, a1, a0
@@ -2079,8 +2079,8 @@ define i1 @isnegfinite_fpclass(float %x) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sext.w a1, a0
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    lui a2, 522240
+; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    slt a0, a0, a2
 ; RV64I-NEXT:    slti a1, a1, 0
 ; RV64I-NEXT:    and a0, a0, a1
@@ -2121,8 +2121,8 @@ define i1 @isnotfinite_fpclass(float %x) {
 ; RV32I-LABEL: isnotfinite_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    lui a1, 522240
+; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    slt a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -2130,8 +2130,8 @@ define i1 @isnotfinite_fpclass(float %x) {
 ; RV64I-LABEL: isnotfinite_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    lui a1, 522240
+; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    addiw a1, a1, -1
 ; RV64I-NEXT:    slt a0, a1, a0
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
index 198b18c75272a9..809cc31abe612f 100644
--- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
@@ -54,7 +54,7 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB1_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -62,19 +62,19 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB1_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB1_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB1_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB1_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -110,7 +110,7 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB1_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -118,19 +118,19 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB1_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB1_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB1_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB1_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -199,10 +199,10 @@ define i64 @test_floor_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI3_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI3_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -241,11 +241,11 @@ define i64 @test_floor_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -312,7 +312,7 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB5_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -320,19 +320,19 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB5_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI5_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI5_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB5_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB5_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB5_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -368,7 +368,7 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB5_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -376,19 +376,19 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB5_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB5_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB5_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB5_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -457,10 +457,10 @@ define i64 @test_ceil_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI7_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI7_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -499,11 +499,11 @@ define i64 @test_ceil_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -570,7 +570,7 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB9_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -578,19 +578,19 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB9_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI9_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI9_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB9_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB9_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB9_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -626,7 +626,7 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB9_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -634,19 +634,19 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB9_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB9_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB9_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB9_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -715,10 +715,10 @@ define i64 @test_trunc_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI11_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI11_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -757,11 +757,11 @@ define i64 @test_trunc_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -828,7 +828,7 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB13_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -836,19 +836,19 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB13_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI13_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI13_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB13_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB13_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB13_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -884,7 +884,7 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB13_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -892,19 +892,19 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB13_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB13_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB13_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB13_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -973,10 +973,10 @@ define i64 @test_round_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI15_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI15_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1015,11 +1015,11 @@ define i64 @test_round_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1086,7 +1086,7 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB17_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -1094,19 +1094,19 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB17_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI17_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI17_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB17_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB17_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB17_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1142,7 +1142,7 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB17_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -1150,19 +1150,19 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB17_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB17_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB17_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB17_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1231,10 +1231,10 @@ define i64 @test_roundeven_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI19_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI19_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1273,11 +1273,11 @@ define i64 @test_roundeven_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1344,7 +1344,7 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB21_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -1352,19 +1352,19 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB21_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI21_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI21_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB21_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB21_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB21_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1400,7 +1400,7 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB21_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -1408,19 +1408,19 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB21_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB21_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB21_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB21_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1489,10 +1489,10 @@ define i64 @test_rint_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI23_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI23_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1531,11 +1531,11 @@ define i64 @test_rint_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index a5fc78445066f0..b8dc7804c49082 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -929,18 +929,18 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call f
-; RV32I-NEXT:    lw a0, 12(s7)
-; RV32I-NEXT:    lw a1, 8(s7)
-; RV32I-NEXT:    add a0, a0, s4
+; RV32I-NEXT:    lw a0, 8(s7)
+; RV32I-NEXT:    lw a1, 12(s7)
 ; RV32I-NEXT:    addi s5, s5, 1
 ; RV32I-NEXT:    seqz a2, s5
 ; RV32I-NEXT:    add s6, s6, a2
 ; RV32I-NEXT:    xor a2, s5, s2
+; RV32I-NEXT:    add a1, a1, s4
 ; RV32I-NEXT:    xor a3, s6, s1
 ; RV32I-NEXT:    or a2, a2, a3
-; RV32I-NEXT:    add s3, a1, s3
-; RV32I-NEXT:    sltu s4, s3, a1
-; RV32I-NEXT:    add s4, a0, s4
+; RV32I-NEXT:    add s3, a0, s3
+; RV32I-NEXT:    sltu s4, s3, a0
+; RV32I-NEXT:    add s4, a1, s4
 ; RV32I-NEXT:    bnez a2, .LBB20_5
 ; RV32I-NEXT:  .LBB20_6: # %for.cond.cleanup
 ; RV32I-NEXT:    mv a0, s3
@@ -994,18 +994,18 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV32I-MEDIUM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-MEDIUM-NEXT:    mv a0, s0
 ; RV32I-MEDIUM-NEXT:    call f
-; RV32I-MEDIUM-NEXT:    lw a0, 12(s7)
-; RV32I-MEDIUM-NEXT:    lw a1, 8(s7)
-; RV32I-MEDIUM-NEXT:    add a0, a0, s4
+; RV32I-MEDIUM-NEXT:    lw a0, 8(s7)
+; RV32I-MEDIUM-NEXT:    lw a1, 12(s7)
 ; RV32I-MEDIUM-NEXT:    addi s5, s5, 1
 ; RV32I-MEDIUM-NEXT:    seqz a2, s5
 ; RV32I-MEDIUM-NEXT:    add s6, s6, a2
 ; RV32I-MEDIUM-NEXT:    xor a2, s5, s2
+; RV32I-MEDIUM-NEXT:    add a1, a1, s4
 ; RV32I-MEDIUM-NEXT:    xor a3, s6, s1
 ; RV32I-MEDIUM-NEXT:    or a2, a2, a3
-; RV32I-MEDIUM-NEXT:    add s3, a1, s3
-; RV32I-MEDIUM-NEXT:    sltu s4, s3, a1
-; RV32I-MEDIUM-NEXT:    add s4, a0, s4
+; RV32I-MEDIUM-NEXT:    add s3, a0, s3
+; RV32I-MEDIUM-NEXT:    sltu s4, s3, a0
+; RV32I-MEDIUM-NEXT:    add s4, a1, s4
 ; RV32I-MEDIUM-NEXT:    bnez a2, .LBB20_5
 ; RV32I-MEDIUM-NEXT:  .LBB20_6: # %for.cond.cleanup
 ; RV32I-MEDIUM-NEXT:    mv a0, s3
diff --git a/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll b/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll
index 1512db87b9311e..2036e7c7adfa81 100644
--- a/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll
+++ b/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll
@@ -33,8 +33,8 @@ define i64 @fold_binop_into_select_2(i1 %c, i64 %x) {
 ; CHECK-LABEL: fold_binop_into_select_2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    li a2, 2
-; CHECK-NEXT:    sub a2, a2, a1
 ; CHECK-NEXT:    slli a0, a0, 63
+; CHECK-NEXT:    sub a2, a2, a1
 ; CHECK-NEXT:    srai a0, a0, 63
 ; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index 133d735a46ed7f..e7719dc70660bc 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -4531,29 +4531,29 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a1
-; RV32-NEXT:    lw a1, 0(a1)
-; RV32-NEXT:    lw a2, 4(s0)
-; RV32-NEXT:    lw a3, 8(s0)
-; RV32-NEXT:    lw a4, 12(s0)
+; RV32-NEXT:    lw a4, 0(a1)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a1, 8(a1)
+; RV32-NEXT:    lw a2, 12(s0)
 ; RV32-NEXT:    mv s1, a0
 ; RV32-NEXT:  .LBB62_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    addi a0, a1, 1
-; RV32-NEXT:    seqz a5, a0
-; RV32-NEXT:    add a5, a2, a5
-; RV32-NEXT:    or a6, a0, a5
-; RV32-NEXT:    seqz a6, a6
-; RV32-NEXT:    add a6, a3, a6
-; RV32-NEXT:    sltu a7, a6, a3
-; RV32-NEXT:    add a7, a4, a7
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a2, 20(sp)
-; RV32-NEXT:    sw a3, 24(sp)
-; RV32-NEXT:    sw a4, 28(sp)
+; RV32-NEXT:    addi a0, a4, 1
+; RV32-NEXT:    sw a4, 16(sp)
+; RV32-NEXT:    sw a3, 20(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a2, 28(sp)
+; RV32-NEXT:    seqz a4, a0
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    or a4, a0, a3
+; RV32-NEXT:    seqz a4, a4
+; RV32-NEXT:    add a4, a1, a4
+; RV32-NEXT:    sltu a1, a4, a1
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    sw a0, 0(sp)
-; RV32-NEXT:    sw a5, 4(sp)
-; RV32-NEXT:    sw a6, 8(sp)
-; RV32-NEXT:    sw a7, 12(sp)
+; RV32-NEXT:    sw a3, 4(sp)
+; RV32-NEXT:    sw a4, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    li a0, 16
 ; RV32-NEXT:    addi a2, sp, 16
 ; RV32-NEXT:    mv a3, sp
@@ -4561,16 +4561,16 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a1, s0
 ; RV32-NEXT:    call __atomic_compare_exchange
-; RV32-NEXT:    lw a1, 16(sp)
-; RV32-NEXT:    lw a2, 20(sp)
-; RV32-NEXT:    lw a3, 24(sp)
-; RV32-NEXT:    lw a4, 28(sp)
+; RV32-NEXT:    lw a4, 16(sp)
+; RV32-NEXT:    lw a3, 20(sp)
+; RV32-NEXT:    lw a1, 24(sp)
+; RV32-NEXT:    lw a2, 28(sp)
 ; RV32-NEXT:    beqz a0, .LBB62_1
 ; RV32-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32-NEXT:    sw a1, 0(s1)
-; RV32-NEXT:    sw a2, 4(s1)
-; RV32-NEXT:    sw a3, 8(s1)
-; RV32-NEXT:    sw a4, 12(s1)
+; RV32-NEXT:    sw a4, 0(s1)
+; RV32-NEXT:    sw a3, 4(s1)
+; RV32-NEXT:    sw a1, 8(s1)
+; RV32-NEXT:    sw a2, 12(s1)
 ; RV32-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/fp128.ll b/llvm/test/CodeGen/RISCV/fp128.ll
index 0bde85b54e5d15..581ee5cd2304df 100644
--- a/llvm/test/CodeGen/RISCV/fp128.ll
+++ b/llvm/test/CodeGen/RISCV/fp128.ll
@@ -14,19 +14,19 @@ define i32 @test_load_and_cmp() nounwind {
 ; RV32I-NEXT:    addi sp, sp, -48
 ; RV32I-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lui a0, %hi(x)
+; RV32I-NEXT:    lui a1, %hi(y)
 ; RV32I-NEXT:    lw a2, %lo(x)(a0)
 ; RV32I-NEXT:    lw a3, %lo(x+4)(a0)
 ; RV32I-NEXT:    lw a4, %lo(x+8)(a0)
 ; RV32I-NEXT:    lw a5, %lo(x+12)(a0)
-; RV32I-NEXT:    lui a0, %hi(y)
-; RV32I-NEXT:    lw a1, %lo(y)(a0)
-; RV32I-NEXT:    lw a6, %lo(y+4)(a0)
-; RV32I-NEXT:    lw a7, %lo(y+8)(a0)
-; RV32I-NEXT:    lw a0, %lo(y+12)(a0)
-; RV32I-NEXT:    sw a1, 8(sp)
+; RV32I-NEXT:    lw a0, %lo(y)(a1)
+; RV32I-NEXT:    lw a6, %lo(y+4)(a1)
+; RV32I-NEXT:    lw a7, %lo(y+8)(a1)
+; RV32I-NEXT:    lw a1, %lo(y+12)(a1)
+; RV32I-NEXT:    sw a0, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a1, 20(sp)
 ; RV32I-NEXT:    addi a0, sp, 24
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    sw a2, 24(sp)
@@ -51,19 +51,19 @@ define i32 @test_add_and_fptosi() nounwind {
 ; RV32I-NEXT:    addi sp, sp, -80
 ; RV32I-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lui a0, %hi(x)
+; RV32I-NEXT:    lui a1, %hi(y)
 ; RV32I-NEXT:    lw a3, %lo(x)(a0)
 ; RV32I-NEXT:    lw a4, %lo(x+4)(a0)
 ; RV32I-NEXT:    lw a5, %lo(x+8)(a0)
 ; RV32I-NEXT:    lw a6, %lo(x+12)(a0)
-; RV32I-NEXT:    lui a0, %hi(y)
-; RV32I-NEXT:    lw a1, %lo(y)(a0)
-; RV32I-NEXT:    lw a2, %lo(y+4)(a0)
-; RV32I-NEXT:    lw a7, %lo(y+8)(a0)
-; RV32I-NEXT:    lw a0, %lo(y+12)(a0)
-; RV32I-NEXT:    sw a1, 24(sp)
+; RV32I-NEXT:    lw a0, %lo(y)(a1)
+; RV32I-NEXT:    lw a2, %lo(y+4)(a1)
+; RV32I-NEXT:    lw a7, %lo(y+8)(a1)
+; RV32I-NEXT:    lw a1, %lo(y+12)(a1)
+; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw a2, 28(sp)
 ; RV32I-NEXT:    sw a7, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a1, 36(sp)
 ; RV32I-NEXT:    addi a0, sp, 56
 ; RV32I-NEXT:    addi a1, sp, 40
 ; RV32I-NEXT:    addi a2, sp, 24
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index bbdfda5c1e10d5..c5c3b199447a9a 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -1282,8 +1282,8 @@ define i64 @utest_f64i64(double %x) {
 ; RV32IF-NEXT:    lw a2, 12(sp)
 ; RV32IF-NEXT:    lw a3, 8(sp)
 ; RV32IF-NEXT:    or a4, a1, a0
-; RV32IF-NEXT:    seqz a4, a4
 ; RV32IF-NEXT:    xori a0, a0, 1
+; RV32IF-NEXT:    seqz a4, a4
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    seqz a0, a0
 ; RV32IF-NEXT:    addi a0, a0, -1
@@ -1326,8 +1326,8 @@ define i64 @utest_f64i64(double %x) {
 ; RV32IFD-NEXT:    lw a2, 12(sp)
 ; RV32IFD-NEXT:    lw a3, 8(sp)
 ; RV32IFD-NEXT:    or a4, a1, a0
-; RV32IFD-NEXT:    seqz a4, a4
 ; RV32IFD-NEXT:    xori a0, a0, 1
+; RV32IFD-NEXT:    seqz a4, a4
 ; RV32IFD-NEXT:    or a0, a0, a1
 ; RV32IFD-NEXT:    seqz a0, a0
 ; RV32IFD-NEXT:    addi a0, a0, -1
@@ -1592,8 +1592,8 @@ define i64 @utest_f32i64(float %x) {
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a0, a0, -1
@@ -1853,8 +1853,8 @@ define i64 @utesth_f16i64(half %x) {
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a0, a0, -1
@@ -2168,8 +2168,8 @@ define i32 @ustest_f64i32_mm(double %x) {
 ; RV32IF-NEXT:    slti a2, a1, 1
 ; RV32IF-NEXT:  .LBB29_3: # %entry
 ; RV32IF-NEXT:    addi a3, a2, -1
-; RV32IF-NEXT:    or a0, a3, a0
 ; RV32IF-NEXT:    neg a2, a2
+; RV32IF-NEXT:    or a0, a3, a0
 ; RV32IF-NEXT:    and a1, a2, a1
 ; RV32IF-NEXT:    slti a1, a1, 0
 ; RV32IF-NEXT:    addi a1, a1, -1
@@ -2459,8 +2459,8 @@ define i32 @ustest_f16i32_mm(half %x) {
 ; RV32-NEXT:    slti a2, a1, 1
 ; RV32-NEXT:  .LBB35_3: # %entry
 ; RV32-NEXT:    addi a3, a2, -1
-; RV32-NEXT:    or a0, a3, a0
 ; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    or a0, a3, a0
 ; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    slti a1, a1, 0
 ; RV32-NEXT:    addi a1, a1, -1
@@ -3216,8 +3216,8 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    lw a2, 12(sp)
 ; RV32IF-NEXT:    lw a3, 8(sp)
 ; RV32IF-NEXT:    or a4, a1, a0
-; RV32IF-NEXT:    seqz a4, a4
 ; RV32IF-NEXT:    xori a0, a0, 1
+; RV32IF-NEXT:    seqz a4, a4
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    seqz a0, a0
 ; RV32IF-NEXT:    addi a0, a0, -1
@@ -3260,8 +3260,8 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    lw a2, 12(sp)
 ; RV32IFD-NEXT:    lw a3, 8(sp)
 ; RV32IFD-NEXT:    or a4, a1, a0
-; RV32IFD-NEXT:    seqz a4, a4
 ; RV32IFD-NEXT:    xori a0, a0, 1
+; RV32IFD-NEXT:    seqz a4, a4
 ; RV32IFD-NEXT:    or a0, a0, a1
 ; RV32IFD-NEXT:    seqz a0, a0
 ; RV32IFD-NEXT:    addi a0, a0, -1
@@ -3335,11 +3335,11 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:  .LBB47_2: # %entry
 ; RV64-NEXT:    slti a1, a1, 1
+; RV64-NEXT:    slti a2, a2, 0
 ; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    slti a1, a2, 0
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore ra
 ; RV64-NEXT:    addi sp, sp, 16
@@ -3484,8 +3484,8 @@ define i64 @utest_f32i64_mm(float %x) {
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a0, a0, -1
@@ -3573,11 +3573,11 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:  .LBB50_2: # %entry
 ; RV64-NEXT:    slti a1, a1, 1
+; RV64-NEXT:    slti a2, a2, 0
 ; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    slti a1, a2, 0
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore ra
 ; RV64-NEXT:    addi sp, sp, 16
@@ -3719,8 +3719,8 @@ define i64 @utesth_f16i64_mm(half %x) {
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a0, a0, -1
@@ -3811,11 +3811,11 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:  .LBB53_2: # %entry
 ; RV64-NEXT:    slti a1, a1, 1
+; RV64-NEXT:    slti a2, a2, 0
 ; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    slti a1, a2, 0
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore ra
 ; RV64-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/fpenv.ll b/llvm/test/CodeGen/RISCV/fpenv.ll
index 48aec0b70b0d40..895effb4ce49bb 100644
--- a/llvm/test/CodeGen/RISCV/fpenv.ll
+++ b/llvm/test/CodeGen/RISCV/fpenv.ll
@@ -6,8 +6,8 @@ define i32 @func_01() {
 ; RV32IF-LABEL: func_01:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    frrm a0
-; RV32IF-NEXT:    slli a0, a0, 2
 ; RV32IF-NEXT:    lui a1, 66
+; RV32IF-NEXT:    slli a0, a0, 2
 ; RV32IF-NEXT:    addi a1, a1, 769
 ; RV32IF-NEXT:    srl a0, a1, a0
 ; RV32IF-NEXT:    andi a0, a0, 7
@@ -16,8 +16,8 @@ define i32 @func_01() {
 ; RV64IF-LABEL: func_01:
 ; RV64IF:       # %bb.0:
 ; RV64IF-NEXT:    frrm a0
-; RV64IF-NEXT:    slli a0, a0, 2
 ; RV64IF-NEXT:    lui a1, 66
+; RV64IF-NEXT:    slli a0, a0, 2
 ; RV64IF-NEXT:    addiw a1, a1, 769
 ; RV64IF-NEXT:    srl a0, a1, a0
 ; RV64IF-NEXT:    andi a0, a0, 7
@@ -40,8 +40,8 @@ define void @func_02(i32 %rm) {
 ; RV64IF-LABEL: func_02:
 ; RV64IF:       # %bb.0:
 ; RV64IF-NEXT:    slli a0, a0, 32
-; RV64IF-NEXT:    srli a0, a0, 30
 ; RV64IF-NEXT:    lui a1, 66
+; RV64IF-NEXT:    srli a0, a0, 30
 ; RV64IF-NEXT:    addiw a1, a1, 769
 ; RV64IF-NEXT:    srl a0, a1, a0
 ; RV64IF-NEXT:    andi a0, a0, 7
diff --git a/llvm/test/CodeGen/RISCV/ghccc-rv32.ll b/llvm/test/CodeGen/RISCV/ghccc-rv32.ll
index 0f9511125adbae..c4c14c6cb8726e 100644
--- a/llvm/test/CodeGen/RISCV/ghccc-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/ghccc-rv32.ll
@@ -33,50 +33,50 @@ define ghccc void @foo() nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(d6)
+; CHECK-NEXT:    lui a1, %hi(d5)
+; CHECK-NEXT:    lui a2, %hi(d4)
+; CHECK-NEXT:    lui a3, %hi(d3)
+; CHECK-NEXT:    lui a4, %hi(d2)
+; CHECK-NEXT:    lui a5, %hi(d1)
+; CHECK-NEXT:    lui a6, %hi(f6)
+; CHECK-NEXT:    lui a7, %hi(f5)
+; CHECK-NEXT:    lui t0, %hi(f4)
+; CHECK-NEXT:    lui t1, %hi(f3)
+; CHECK-NEXT:    lui t2, %hi(f2)
 ; CHECK-NEXT:    fld fs11, %lo(d6)(a0)
-; CHECK-NEXT:    lui a0, %hi(d5)
-; CHECK-NEXT:    fld fs10, %lo(d5)(a0)
-; CHECK-NEXT:    lui a0, %hi(d4)
-; CHECK-NEXT:    fld fs9, %lo(d4)(a0)
-; CHECK-NEXT:    lui a0, %hi(d3)
-; CHECK-NEXT:    fld fs8, %lo(d3)(a0)
-; CHECK-NEXT:    lui a0, %hi(d2)
-; CHECK-NEXT:    fld fs7, %lo(d2)(a0)
-; CHECK-NEXT:    lui a0, %hi(d1)
-; CHECK-NEXT:    fld fs6, %lo(d1)(a0)
-; CHECK-NEXT:    lui a0, %hi(f6)
-; CHECK-NEXT:    flw fs5, %lo(f6)(a0)
-; CHECK-NEXT:    lui a0, %hi(f5)
-; CHECK-NEXT:    flw fs4, %lo(f5)(a0)
-; CHECK-NEXT:    lui a0, %hi(f4)
-; CHECK-NEXT:    flw fs3, %lo(f4)(a0)
-; CHECK-NEXT:    lui a0, %hi(f3)
-; CHECK-NEXT:    flw fs2, %lo(f3)(a0)
-; CHECK-NEXT:    lui a0, %hi(f2)
-; CHECK-NEXT:    flw fs1, %lo(f2)(a0)
 ; CHECK-NEXT:    lui a0, %hi(f1)
+; CHECK-NEXT:    fld fs10, %lo(d5)(a1)
+; CHECK-NEXT:    lui a1, %hi(splim)
+; CHECK-NEXT:    fld fs9, %lo(d4)(a2)
+; CHECK-NEXT:    lui a2, %hi(r7)
+; CHECK-NEXT:    fld fs8, %lo(d3)(a3)
+; CHECK-NEXT:    lui a3, %hi(r6)
+; CHECK-NEXT:    fld fs7, %lo(d2)(a4)
+; CHECK-NEXT:    lui a4, %hi(r5)
+; CHECK-NEXT:    fld fs6, %lo(d1)(a5)
+; CHECK-NEXT:    lui a5, %hi(r4)
+; CHECK-NEXT:    flw fs5, %lo(f6)(a6)
+; CHECK-NEXT:    lui a6, %hi(r3)
+; CHECK-NEXT:    flw fs4, %lo(f5)(a7)
+; CHECK-NEXT:    lui a7, %hi(r2)
+; CHECK-NEXT:    flw fs3, %lo(f4)(t0)
+; CHECK-NEXT:    lui t0, %hi(r1)
+; CHECK-NEXT:    flw fs2, %lo(f3)(t1)
+; CHECK-NEXT:    lui t1, %hi(hp)
+; CHECK-NEXT:    flw fs1, %lo(f2)(t2)
+; CHECK-NEXT:    lui t2, %hi(sp)
 ; CHECK-NEXT:    flw fs0, %lo(f1)(a0)
-; CHECK-NEXT:    lui a0, %hi(splim)
-; CHECK-NEXT:    lw s11, %lo(splim)(a0)
-; CHECK-NEXT:    lui a0, %hi(r7)
-; CHECK-NEXT:    lw s10, %lo(r7)(a0)
-; CHECK-NEXT:    lui a0, %hi(r6)
-; CHECK-NEXT:    lw s9, %lo(r6)(a0)
-; CHECK-NEXT:    lui a0, %hi(r5)
-; CHECK-NEXT:    lw s8, %lo(r5)(a0)
-; CHECK-NEXT:    lui a0, %hi(r4)
-; CHECK-NEXT:    lw s7, %lo(r4)(a0)
-; CHECK-NEXT:    lui a0, %hi(r3)
-; CHECK-NEXT:    lw s6, %lo(r3)(a0)
-; CHECK-NEXT:    lui a0, %hi(r2)
-; CHECK-NEXT:    lw s5, %lo(r2)(a0)
-; CHECK-NEXT:    lui a0, %hi(r1)
-; CHECK-NEXT:    lw s4, %lo(r1)(a0)
-; CHECK-NEXT:    lui a0, %hi(hp)
-; CHECK-NEXT:    lw s3, %lo(hp)(a0)
-; CHECK-NEXT:    lui a0, %hi(sp)
-; CHECK-NEXT:    lw s2, %lo(sp)(a0)
 ; CHECK-NEXT:    lui a0, %hi(base)
+; CHECK-NEXT:    lw s11, %lo(splim)(a1)
+; CHECK-NEXT:    lw s10, %lo(r7)(a2)
+; CHECK-NEXT:    lw s9, %lo(r6)(a3)
+; CHECK-NEXT:    lw s8, %lo(r5)(a4)
+; CHECK-NEXT:    lw s7, %lo(r4)(a5)
+; CHECK-NEXT:    lw s6, %lo(r3)(a6)
+; CHECK-NEXT:    lw s5, %lo(r2)(a7)
+; CHECK-NEXT:    lw s4, %lo(r1)(t0)
+; CHECK-NEXT:    lw s3, %lo(hp)(t1)
+; CHECK-NEXT:    lw s2, %lo(sp)(t2)
 ; CHECK-NEXT:    lw s1, %lo(base)(a0)
 ; CHECK-NEXT:    tail bar
 entry:
diff --git a/llvm/test/CodeGen/RISCV/ghccc-rv64.ll b/llvm/test/CodeGen/RISCV/ghccc-rv64.ll
index 79afd4bc375d53..8e3fd2ca709aa5 100644
--- a/llvm/test/CodeGen/RISCV/ghccc-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/ghccc-rv64.ll
@@ -33,50 +33,50 @@ define ghccc void @foo() nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(d6)
+; CHECK-NEXT:    lui a1, %hi(d5)
+; CHECK-NEXT:    lui a2, %hi(d4)
+; CHECK-NEXT:    lui a3, %hi(d3)
+; CHECK-NEXT:    lui a4, %hi(d2)
+; CHECK-NEXT:    lui a5, %hi(d1)
+; CHECK-NEXT:    lui a6, %hi(f6)
+; CHECK-NEXT:    lui a7, %hi(f5)
+; CHECK-NEXT:    lui t0, %hi(f4)
+; CHECK-NEXT:    lui t1, %hi(f3)
+; CHECK-NEXT:    lui t2, %hi(f2)
 ; CHECK-NEXT:    fld fs11, %lo(d6)(a0)
-; CHECK-NEXT:    lui a0, %hi(d5)
-; CHECK-NEXT:    fld fs10, %lo(d5)(a0)
-; CHECK-NEXT:    lui a0, %hi(d4)
-; CHECK-NEXT:    fld fs9, %lo(d4)(a0)
-; CHECK-NEXT:    lui a0, %hi(d3)
-; CHECK-NEXT:    fld fs8, %lo(d3)(a0)
-; CHECK-NEXT:    lui a0, %hi(d2)
-; CHECK-NEXT:    fld fs7, %lo(d2)(a0)
-; CHECK-NEXT:    lui a0, %hi(d1)
-; CHECK-NEXT:    fld fs6, %lo(d1)(a0)
-; CHECK-NEXT:    lui a0, %hi(f6)
-; CHECK-NEXT:    flw fs5, %lo(f6)(a0)
-; CHECK-NEXT:    lui a0, %hi(f5)
-; CHECK-NEXT:    flw fs4, %lo(f5)(a0)
-; CHECK-NEXT:    lui a0, %hi(f4)
-; CHECK-NEXT:    flw fs3, %lo(f4)(a0)
-; CHECK-NEXT:    lui a0, %hi(f3)
-; CHECK-NEXT:    flw fs2, %lo(f3)(a0)
-; CHECK-NEXT:    lui a0, %hi(f2)
-; CHECK-NEXT:    flw fs1, %lo(f2)(a0)
 ; CHECK-NEXT:    lui a0, %hi(f1)
+; CHECK-NEXT:    fld fs10, %lo(d5)(a1)
+; CHECK-NEXT:    lui a1, %hi(splim)
+; CHECK-NEXT:    fld fs9, %lo(d4)(a2)
+; CHECK-NEXT:    lui a2, %hi(r7)
+; CHECK-NEXT:    fld fs8, %lo(d3)(a3)
+; CHECK-NEXT:    lui a3, %hi(r6)
+; CHECK-NEXT:    fld fs7, %lo(d2)(a4)
+; CHECK-NEXT:    lui a4, %hi(r5)
+; CHECK-NEXT:    fld fs6, %lo(d1)(a5)
+; CHECK-NEXT:    lui a5, %hi(r4)
+; CHECK-NEXT:    flw fs5, %lo(f6)(a6)
+; CHECK-NEXT:    lui a6, %hi(r3)
+; CHECK-NEXT:    flw fs4, %lo(f5)(a7)
+; CHECK-NEXT:    lui a7, %hi(r2)
+; CHECK-NEXT:    flw fs3, %lo(f4)(t0)
+; CHECK-NEXT:    lui t0, %hi(r1)
+; CHECK-NEXT:    flw fs2, %lo(f3)(t1)
+; CHECK-NEXT:    lui t1, %hi(hp)
+; CHECK-NEXT:    flw fs1, %lo(f2)(t2)
+; CHECK-NEXT:    lui t2, %hi(sp)
 ; CHECK-NEXT:    flw fs0, %lo(f1)(a0)
-; CHECK-NEXT:    lui a0, %hi(splim)
-; CHECK-NEXT:    ld s11, %lo(splim)(a0)
-; CHECK-NEXT:    lui a0, %hi(r7)
-; CHECK-NEXT:    ld s10, %lo(r7)(a0)
-; CHECK-NEXT:    lui a0, %hi(r6)
-; CHECK-NEXT:    ld s9, %lo(r6)(a0)
-; CHECK-NEXT:    lui a0, %hi(r5)
-; CHECK-NEXT:    ld s8, %lo(r5)(a0)
-; CHECK-NEXT:    lui a0, %hi(r4)
-; CHECK-NEXT:    ld s7, %lo(r4)(a0)
-; CHECK-NEXT:    lui a0, %hi(r3)
-; CHECK-NEXT:    ld s6, %lo(r3)(a0)
-; CHECK-NEXT:    lui a0, %hi(r2)
-; CHECK-NEXT:    ld s5, %lo(r2)(a0)
-; CHECK-NEXT:    lui a0, %hi(r1)
-; CHECK-NEXT:    ld s4, %lo(r1)(a0)
-; CHECK-NEXT:    lui a0, %hi(hp)
-; CHECK-NEXT:    ld s3, %lo(hp)(a0)
-; CHECK-NEXT:    lui a0, %hi(sp)
-; CHECK-NEXT:    ld s2, %lo(sp)(a0)
 ; CHECK-NEXT:    lui a0, %hi(base)
+; CHECK-NEXT:    ld s11, %lo(splim)(a1)
+; CHECK-NEXT:    ld s10, %lo(r7)(a2)
+; CHECK-NEXT:    ld s9, %lo(r6)(a3)
+; CHECK-NEXT:    ld s8, %lo(r5)(a4)
+; CHECK-NEXT:    ld s7, %lo(r4)(a5)
+; CHECK-NEXT:    ld s6, %lo(r3)(a6)
+; CHECK-NEXT:    ld s5, %lo(r2)(a7)
+; CHECK-NEXT:    ld s4, %lo(r1)(t0)
+; CHECK-NEXT:    ld s3, %lo(hp)(t1)
+; CHECK-NEXT:    ld s2, %lo(sp)(t2)
 ; CHECK-NEXT:    ld s1, %lo(base)(a0)
 ; CHECK-NEXT:    tail bar
 entry:
diff --git a/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll
index 6437beae090154..abc555b994a3b9 100644
--- a/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll
+++ b/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll
@@ -14,17 +14,17 @@ define ghccc void @caller_float() nounwind {
 ; CHECK-LABEL: caller_float:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(f6)
+; CHECK-NEXT:    lui a1, %hi(f5)
+; CHECK-NEXT:    lui a2, %hi(f4)
+; CHECK-NEXT:    lui a3, %hi(f3)
+; CHECK-NEXT:    lui a4, %hi(f2)
+; CHECK-NEXT:    lui a5, %hi(f1)
 ; CHECK-NEXT:    lw s6, %lo(f6)(a0)
-; CHECK-NEXT:    lui a0, %hi(f5)
-; CHECK-NEXT:    lw s5, %lo(f5)(a0)
-; CHECK-NEXT:    lui a0, %hi(f4)
-; CHECK-NEXT:    lw s4, %lo(f4)(a0)
-; CHECK-NEXT:    lui a0, %hi(f3)
-; CHECK-NEXT:    lw s3, %lo(f3)(a0)
-; CHECK-NEXT:    lui a0, %hi(f2)
-; CHECK-NEXT:    lw s2, %lo(f2)(a0)
-; CHECK-NEXT:    lui a0, %hi(f1)
-; CHECK-NEXT:    lw s1, %lo(f1)(a0)
+; CHECK-NEXT:    lw s5, %lo(f5)(a1)
+; CHECK-NEXT:    lw s4, %lo(f4)(a2)
+; CHECK-NEXT:    lw s3, %lo(f3)(a3)
+; CHECK-NEXT:    lw s2, %lo(f2)(a4)
+; CHECK-NEXT:    lw s1, %lo(f1)(a5)
 ; CHECK-NEXT:    tail callee_float
 entry:
   %0  = load float, ptr @f6
@@ -50,17 +50,17 @@ define ghccc void @caller_double() nounwind {
 ; CHECK-LABEL: caller_double:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(d6)
+; CHECK-NEXT:    lui a1, %hi(d5)
+; CHECK-NEXT:    lui a2, %hi(d4)
+; CHECK-NEXT:    lui a3, %hi(d3)
+; CHECK-NEXT:    lui a4, %hi(d2)
+; CHECK-NEXT:    lui a5, %hi(d1)
 ; CHECK-NEXT:    ld s6, %lo(d6)(a0)
-; CHECK-NEXT:    lui a0, %hi(d5)
-; CHECK-NEXT:    ld s5, %lo(d5)(a0)
-; CHECK-NEXT:    lui a0, %hi(d4)
-; CHECK-NEXT:    ld s4, %lo(d4)(a0)
-; CHECK-NEXT:    lui a0, %hi(d3)
-; CHECK-NEXT:    ld s3, %lo(d3)(a0)
-; CHECK-NEXT:    lui a0, %hi(d2)
-; CHECK-NEXT:    ld s2, %lo(d2)(a0)
-; CHECK-NEXT:    lui a0, %hi(d1)
-; CHECK-NEXT:    ld s1, %lo(d1)(a0)
+; CHECK-NEXT:    ld s5, %lo(d5)(a1)
+; CHECK-NEXT:    ld s4, %lo(d4)(a2)
+; CHECK-NEXT:    ld s3, %lo(d3)(a3)
+; CHECK-NEXT:    ld s2, %lo(d2)(a4)
+; CHECK-NEXT:    ld s1, %lo(d1)(a5)
 ; CHECK-NEXT:    tail callee_double
 entry:
   %0  = load double, ptr @d6
diff --git a/llvm/test/CodeGen/RISCV/global-merge.ll b/llvm/test/CodeGen/RISCV/global-merge.ll
index 633ba719c6a305..9dde032b69f8b1 100644
--- a/llvm/test/CodeGen/RISCV/global-merge.ll
+++ b/llvm/test/CodeGen/RISCV/global-merge.ll
@@ -23,12 +23,12 @@
 define void @f1(i32 %a) nounwind {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(eg1)
+; CHECK-NEXT:    sw a0, %lo(eg1)(a1)
 ; CHECK-NEXT:    lui a1, %hi(.L_MergedGlobals)
 ; CHECK-NEXT:    sw a0, %lo(.L_MergedGlobals)(a1)
 ; CHECK-NEXT:    addi a1, a1, %lo(.L_MergedGlobals)
 ; CHECK-NEXT:    sw a0, 4(a1)
-; CHECK-NEXT:    lui a1, %hi(eg1)
-; CHECK-NEXT:    sw a0, %lo(eg1)(a1)
 ; CHECK-NEXT:    lui a1, %hi(eg2)
 ; CHECK-NEXT:    sw a0, %lo(eg2)(a1)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/half-arith-strict.ll b/llvm/test/CodeGen/RISCV/half-arith-strict.ll
index 4e4aad7309791c..636739cf38984a 100644
--- a/llvm/test/CodeGen/RISCV/half-arith-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith-strict.ll
@@ -243,28 +243,28 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa2
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa1
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa1
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa3, fa4, fa5
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa4, fa3, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECK-ZFHMIN-NEXT:    ret
 ;
 ; CHECK-ZHINXMIN-LABEL: fmsub_h:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a2, a2, zero
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
-; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECK-ZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -295,17 +295,17 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa2
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa4, fa4
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
-; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa4
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a2, fa4
+; CHECK-ZFHMIN-NEXT:    xor a1, a1, a0
+; CHECK-ZFHMIN-NEXT:    xor a0, a2, a0
+; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a1
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -317,12 +317,12 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZHINXMIN-LABEL: fnmadd_h:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK-ZHINXMIN-NEXT:    fadd.s a0, a0, zero
-; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a0, a0, a3
 ; CHECK-ZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
@@ -359,17 +359,17 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa2
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa4, fa4
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
-; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa4
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a2, fa4
+; CHECK-ZFHMIN-NEXT:    xor a1, a1, a0
+; CHECK-ZFHMIN-NEXT:    xor a0, a2, a0
+; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a1
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -381,12 +381,12 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZHINXMIN-LABEL: fnmadd_h_2:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECK-ZHINXMIN-NEXT:    fadd.s a1, a1, zero
-; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fadd.s a1, a1, zero
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a1, a1, a3
 ; CHECK-ZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
@@ -421,28 +421,28 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa2
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa2
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa1
-; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa5, fa3, fa4
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa1
+; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa5, fa4, fa3
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECK-ZFHMIN-NEXT:    ret
 ;
 ; CHECK-ZHINXMIN-LABEL: fnmsub_h:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a0, a0, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -471,28 +471,28 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa2
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa2
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa3, fa5, fa4
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa4, fa5, fa3
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECK-ZFHMIN-NEXT:    ret
 ;
 ; CHECK-ZHINXMIN-LABEL: fnmsub_h_2:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a1, a1, zero
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a1, a1
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a1, a1, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECK-ZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll
index 4c2deafdc7e668..a218e89948d4b4 100644
--- a/llvm/test/CodeGen/RISCV/half-arith.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith.ll
@@ -425,8 +425,8 @@ define half @fsgnj_h(half %a, half %b) nounwind {
 ; RV32I-LABEL: fsgnj_h:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 1048568
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 17
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -434,8 +434,8 @@ define half @fsgnj_h(half %a, half %b) nounwind {
 ; RV64I-LABEL: fsgnj_h:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 1048568
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 49
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -469,8 +469,8 @@ define half @fsgnj_h(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-NEXT:    # kill: def $x11_h killed $x11_h def $x11
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32IZHINXMIN-NEXT:    lui a2, 1048568
-; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
+; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -481,8 +481,8 @@ define half @fsgnj_h(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-NEXT:    # kill: def $x11_h killed $x11_h def $x11
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64IZHINXMIN-NEXT:    lui a2, 1048568
-; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
+; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -573,11 +573,11 @@ define i32 @fneg_h(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-LABEL: fneg_h:
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -587,9 +587,9 @@ define i32 @fneg_h(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: fneg_h:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    lui a1, 1048568
 ; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECKIZHINXMIN-NEXT:    lui a1, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a1, a0, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
@@ -644,8 +644,8 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:    call __truncsfhf2
 ; RV32I-NEXT:    lui a1, 1048568
-; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli s1, s1, 17
+; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    srli s1, s1, 17
 ; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -683,8 +683,8 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    call __truncsfhf2
 ; RV64I-NEXT:    lui a1, 1048568
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli s1, s1, 49
+; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    srli s1, s1, 49
 ; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -699,12 +699,12 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; RV32IZFHMIN-NEXT:    lui a0, 1048568
 ; RV32IZFHMIN-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fmv.x.h a0, fa5
-; RV32IZFHMIN-NEXT:    not a0, a0
-; RV32IZFHMIN-NEXT:    lui a1, 1048568
-; RV32IZFHMIN-NEXT:    and a0, a0, a1
+; RV32IZFHMIN-NEXT:    fmv.x.h a1, fa5
+; RV32IZFHMIN-NEXT:    not a1, a1
+; RV32IZFHMIN-NEXT:    and a0, a1, a0
 ; RV32IZFHMIN-NEXT:    fmv.x.h a1, fa0
 ; RV32IZFHMIN-NEXT:    slli a1, a1, 17
 ; RV32IZFHMIN-NEXT:    srli a1, a1, 17
@@ -716,12 +716,12 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; RV64IZFHMIN-NEXT:    lui a0, 1048568
 ; RV64IZFHMIN-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV64IZFHMIN-NEXT:    fmv.x.h a0, fa5
-; RV64IZFHMIN-NEXT:    not a0, a0
-; RV64IZFHMIN-NEXT:    lui a1, 1048568
-; RV64IZFHMIN-NEXT:    and a0, a0, a1
+; RV64IZFHMIN-NEXT:    fmv.x.h a1, fa5
+; RV64IZFHMIN-NEXT:    not a1, a1
+; RV64IZFHMIN-NEXT:    and a0, a1, a0
 ; RV64IZFHMIN-NEXT:    fmv.x.h a1, fa0
 ; RV64IZFHMIN-NEXT:    slli a1, a1, 49
 ; RV64IZFHMIN-NEXT:    srli a1, a1, 49
@@ -735,11 +735,11 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a2, a0
 ; RV32IZHINXMIN-NEXT:    fadd.s a1, a2, a1
+; RV32IZHINXMIN-NEXT:    lui a2, 1048568
+; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; RV32IZHINXMIN-NEXT:    not a1, a1
-; RV32IZHINXMIN-NEXT:    lui a2, 1048568
 ; RV32IZHINXMIN-NEXT:    and a1, a1, a2
-; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -751,11 +751,11 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a2, a0
 ; RV64IZHINXMIN-NEXT:    fadd.s a1, a2, a1
+; RV64IZHINXMIN-NEXT:    lui a2, 1048568
+; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; RV64IZHINXMIN-NEXT:    not a1, a1
-; RV64IZHINXMIN-NEXT:    lui a2, 1048568
 ; RV64IZHINXMIN-NEXT:    and a1, a1, a2
-; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -1298,28 +1298,28 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa2
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa1
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa3, fa4, fa5
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa4, fa3, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fmsub_h:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -1466,17 +1466,17 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa2
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
-; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa4
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    fmv.x.h a2, fa4
+; CHECKIZFHMIN-NEXT:    xor a1, a1, a0
+; CHECKIZFHMIN-NEXT:    xor a0, a2, a0
+; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a1
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -1488,12 +1488,12 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fnmadd_h:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a0, a0, a3
 ; CHECKIZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
@@ -1646,17 +1646,17 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa2
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
-; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa4
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    fmv.x.h a2, fa4
+; CHECKIZFHMIN-NEXT:    xor a1, a1, a0
+; CHECKIZFHMIN-NEXT:    xor a0, a2, a0
+; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a1
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -1668,12 +1668,12 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fnmadd_h_2:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
 ; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a1, a1, a3
 ; CHECKIZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
@@ -2039,28 +2039,28 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa2
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa2
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
-; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa5, fa3, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa1
+; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa5, fa4, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fnmsub_h:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a0, a0, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -2177,28 +2177,28 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa2
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa2
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa3, fa5, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa4, fa5, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fnmsub_h_2:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a1, a1, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -2430,11 +2430,11 @@ define half @fmsub_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa2
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa2, fa0
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
+; CHECKIZFHMIN-NEXT:    fmul.s fa4, fa2, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa1
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECKIZFHMIN-NEXT:    fmul.s fa4, fa3, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
@@ -2445,13 +2445,13 @@ define half @fmsub_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fmsub_h_contract:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
-; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
 ; CHECKIZHINXMIN-NEXT:    fmul.s a0, a0, a1
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a2
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fsub.s a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -2606,21 +2606,21 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
-; CHECKIZFHMIN-NEXT:    fadd.s fa3, fa3, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa3, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa2, fa2
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
+; CHECKIZFHMIN-NEXT:    fadd.s fa3, fa3, fa4
 ; CHECKIZFHMIN-NEXT:    fadd.s fa4, fa2, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa3, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fmul.s fa5, fa5, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
@@ -2631,19 +2631,19 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fnmadd_h_contract:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
-; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fmul.s a0, a0, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    lui a1, 1048568
+; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    xor a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a2
@@ -2781,10 +2781,10 @@ define half @fnmsub_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
+; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -2799,10 +2799,10 @@ define half @fnmsub_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fnmsub_h_contract:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
diff --git a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll
index e0c47bfac6fec8..730bde5af610b2 100644
--- a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll
+++ b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll
@@ -165,8 +165,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a1, a1
 ; RV32I-NEXT:    lui a2, 1048568
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 17
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -183,8 +183,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    not a1, a1
 ; RV64I-NEXT:    lui a2, 1048568
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 49
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -211,8 +211,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    not a1, a1
 ; RV32IZFHMIN-NEXT:    lui a2, 1048568
-; RV32IZFHMIN-NEXT:    and a1, a1, a2
 ; RV32IZFHMIN-NEXT:    slli a0, a0, 17
+; RV32IZFHMIN-NEXT:    and a1, a1, a2
 ; RV32IZFHMIN-NEXT:    srli a0, a0, 17
 ; RV32IZFHMIN-NEXT:    or a0, a0, a1
 ; RV32IZFHMIN-NEXT:    ret
@@ -221,8 +221,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    not a1, a1
 ; RV64IZFHMIN-NEXT:    lui a2, 1048568
-; RV64IZFHMIN-NEXT:    and a1, a1, a2
 ; RV64IZFHMIN-NEXT:    slli a0, a0, 49
+; RV64IZFHMIN-NEXT:    and a1, a1, a2
 ; RV64IZFHMIN-NEXT:    srli a0, a0, 49
 ; RV64IZFHMIN-NEXT:    or a0, a0, a1
 ; RV64IZFHMIN-NEXT:    ret
@@ -233,8 +233,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32IZHINXMIN-NEXT:    not a1, a1
 ; RV32IZHINXMIN-NEXT:    lui a2, 1048568
-; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
+; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -246,8 +246,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64IZHINXMIN-NEXT:    not a1, a1
 ; RV64IZHINXMIN-NEXT:    lui a2, 1048568
-; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
+; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
diff --git a/llvm/test/CodeGen/RISCV/half-br-fcmp.ll b/llvm/test/CodeGen/RISCV/half-br-fcmp.ll
index 6699ee94793796..e9b142e33362fa 100644
--- a/llvm/test/CodeGen/RISCV/half-br-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/half-br-fcmp.ll
@@ -927,9 +927,9 @@ define void @br_fcmp_ord(half %a, half %b) nounwind {
 ; RV32IZFHMIN-LABEL: br_fcmp_ord:
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
+; RV32IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
 ; RV32IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFHMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV32IZFHMIN-NEXT:    and a0, a1, a0
 ; RV32IZFHMIN-NEXT:    bnez a0, .LBB8_2
 ; RV32IZFHMIN-NEXT:  # %bb.1: # %if.else
@@ -942,9 +942,9 @@ define void @br_fcmp_ord(half %a, half %b) nounwind {
 ; RV64IZFHMIN-LABEL: br_fcmp_ord:
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
+; RV64IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
 ; RV64IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFHMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV64IZFHMIN-NEXT:    and a0, a1, a0
 ; RV64IZFHMIN-NEXT:    bnez a0, .LBB8_2
 ; RV64IZFHMIN-NEXT:  # %bb.1: # %if.else
@@ -957,8 +957,8 @@ define void @br_fcmp_ord(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-LABEL: br_fcmp_ord:
 ; RV32IZHINXMIN:       # %bb.0:
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; RV32IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; RV32IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV32IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; RV32IZHINXMIN-NEXT:    and a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    bnez a0, .LBB8_2
@@ -972,8 +972,8 @@ define void @br_fcmp_ord(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-LABEL: br_fcmp_ord:
 ; RV64IZHINXMIN:       # %bb.0:
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; RV64IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; RV64IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV64IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; RV64IZHINXMIN-NEXT:    and a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    bnez a0, .LBB8_2
@@ -1694,9 +1694,9 @@ define void @br_fcmp_uno(half %a, half %b) nounwind {
 ; RV32IZFHMIN-LABEL: br_fcmp_uno:
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
+; RV32IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
 ; RV32IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFHMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV32IZFHMIN-NEXT:    and a0, a1, a0
 ; RV32IZFHMIN-NEXT:    beqz a0, .LBB15_2
 ; RV32IZFHMIN-NEXT:  # %bb.1: # %if.else
@@ -1709,9 +1709,9 @@ define void @br_fcmp_uno(half %a, half %b) nounwind {
 ; RV64IZFHMIN-LABEL: br_fcmp_uno:
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
+; RV64IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
 ; RV64IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFHMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV64IZFHMIN-NEXT:    and a0, a1, a0
 ; RV64IZFHMIN-NEXT:    beqz a0, .LBB15_2
 ; RV64IZFHMIN-NEXT:  # %bb.1: # %if.else
@@ -1724,8 +1724,8 @@ define void @br_fcmp_uno(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-LABEL: br_fcmp_uno:
 ; RV32IZHINXMIN:       # %bb.0:
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; RV32IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; RV32IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV32IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; RV32IZHINXMIN-NEXT:    and a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    beqz a0, .LBB15_2
@@ -1739,8 +1739,8 @@ define void @br_fcmp_uno(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-LABEL: br_fcmp_uno:
 ; RV64IZHINXMIN:       # %bb.0:
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; RV64IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; RV64IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV64IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; RV64IZHINXMIN-NEXT:    and a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    beqz a0, .LBB15_2
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index 054a9041a79267..01ffcab1a6556f 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -194,13 +194,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_si_h_sat:
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFH-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV32IZFH-NEXT:    lui a1, 815104
-; RV32IZFH-NEXT:    fmv.w.x fa3, a1
+; RV32IZFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV32IZFH-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV32IZFH-NEXT:    lui a0, 815104
+; RV32IZFH-NEXT:    fmv.w.x fa3, a0
 ; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV32IZFH-NEXT:    neg a0, a0
+; RV32IZFH-NEXT:    neg a0, a1
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IZFH-NEXT:    and a0, a0, a1
@@ -209,13 +209,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64IZFH-LABEL: fcvt_si_h_sat:
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFH-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV64IZFH-NEXT:    lui a1, 815104
-; RV64IZFH-NEXT:    fmv.w.x fa3, a1
+; RV64IZFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV64IZFH-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV64IZFH-NEXT:    lui a0, 815104
+; RV64IZFH-NEXT:    fmv.w.x fa3, a0
 ; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV64IZFH-NEXT:    neg a0, a0
+; RV64IZFH-NEXT:    neg a0, a1
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IZFH-NEXT:    and a0, a0, a1
@@ -224,13 +224,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32IDZFH-LABEL: fcvt_si_h_sat:
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IDZFH-NEXT:    feq.s a0, fa5, fa5
-; RV32IDZFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV32IDZFH-NEXT:    lui a1, 815104
-; RV32IDZFH-NEXT:    fmv.w.x fa3, a1
+; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV32IDZFH-NEXT:    feq.s a1, fa5, fa5
+; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV32IDZFH-NEXT:    lui a0, 815104
+; RV32IDZFH-NEXT:    fmv.w.x fa3, a0
 ; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV32IDZFH-NEXT:    neg a0, a0
+; RV32IDZFH-NEXT:    neg a0, a1
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IDZFH-NEXT:    and a0, a0, a1
@@ -239,13 +239,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64IDZFH-LABEL: fcvt_si_h_sat:
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IDZFH-NEXT:    feq.s a0, fa5, fa5
-; RV64IDZFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV64IDZFH-NEXT:    lui a1, 815104
-; RV64IDZFH-NEXT:    fmv.w.x fa3, a1
+; RV64IDZFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV64IDZFH-NEXT:    feq.s a1, fa5, fa5
+; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV64IDZFH-NEXT:    lui a0, 815104
+; RV64IDZFH-NEXT:    fmv.w.x fa3, a0
 ; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV64IDZFH-NEXT:    neg a0, a0
+; RV64IDZFH-NEXT:    neg a0, a1
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IDZFH-NEXT:    and a0, a0, a1
@@ -254,57 +254,57 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_si_h_sat:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZHINX-NEXT:    neg a1, a1
-; RV32IZHINX-NEXT:    lui a2, 815104
-; RV32IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZHINX-NEXT:    lui a1, 815104
 ; RV32IZHINX-NEXT:    lui a2, 290816
+; RV32IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZHINX-NEXT:    feq.s a0, a0, a0
 ; RV32IZHINX-NEXT:    addi a2, a2, -512
-; RV32IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZHINX-NEXT:    and a0, a1, a0
+; RV32IZHINX-NEXT:    neg a0, a0
+; RV32IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZHINX-NEXT:    and a0, a0, a1
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: fcvt_si_h_sat:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZHINX-NEXT:    neg a1, a1
-; RV64IZHINX-NEXT:    lui a2, 815104
-; RV64IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZHINX-NEXT:    lui a1, 815104
 ; RV64IZHINX-NEXT:    lui a2, 290816
+; RV64IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZHINX-NEXT:    feq.s a0, a0, a0
 ; RV64IZHINX-NEXT:    addiw a2, a2, -512
-; RV64IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZHINX-NEXT:    and a0, a1, a0
+; RV64IZHINX-NEXT:    neg a0, a0
+; RV64IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZHINX-NEXT:    and a0, a0, a1
 ; RV64IZHINX-NEXT:    ret
 ;
 ; RV32IZDINXZHINX-LABEL: fcvt_si_h_sat:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZDINXZHINX-NEXT:    neg a1, a1
-; RV32IZDINXZHINX-NEXT:    lui a2, 815104
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZDINXZHINX-NEXT:    lui a1, 815104
 ; RV32IZDINXZHINX-NEXT:    lui a2, 290816
+; RV32IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZDINXZHINX-NEXT:    feq.s a0, a0, a0
 ; RV32IZDINXZHINX-NEXT:    addi a2, a2, -512
-; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZDINXZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV32IZDINXZHINX-NEXT:    neg a0, a0
+; RV32IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZDINXZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    ret
 ;
 ; RV64IZDINXZHINX-LABEL: fcvt_si_h_sat:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZDINXZHINX-NEXT:    neg a1, a1
-; RV64IZDINXZHINX-NEXT:    lui a2, 815104
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZDINXZHINX-NEXT:    lui a1, 815104
 ; RV64IZDINXZHINX-NEXT:    lui a2, 290816
+; RV64IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZDINXZHINX-NEXT:    feq.s a0, a0, a0
 ; RV64IZDINXZHINX-NEXT:    addiw a2, a2, -512
-; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZDINXZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV64IZDINXZHINX-NEXT:    neg a0, a0
+; RV64IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZDINXZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcvt_si_h_sat:
@@ -399,13 +399,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
-; RV32ID-ILP32-NEXT:    feq.s a0, fa5, fa5
-; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV32ID-ILP32-NEXT:    lui a1, 815104
-; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a1
+; RV32ID-ILP32-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV32ID-ILP32-NEXT:    feq.s a1, fa5, fa5
+; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV32ID-ILP32-NEXT:    lui a0, 815104
+; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a0
 ; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa3
-; RV32ID-ILP32-NEXT:    neg a0, a0
+; RV32ID-ILP32-NEXT:    neg a0, a1
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-ILP32-NEXT:    and a0, a0, a1
@@ -419,13 +419,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
-; RV64ID-LP64-NEXT:    feq.s a0, fa5, fa5
-; RV64ID-LP64-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV64ID-LP64-NEXT:    lui a1, 815104
-; RV64ID-LP64-NEXT:    fmv.w.x fa3, a1
+; RV64ID-LP64-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV64ID-LP64-NEXT:    feq.s a1, fa5, fa5
+; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV64ID-LP64-NEXT:    lui a0, 815104
+; RV64ID-LP64-NEXT:    fmv.w.x fa3, a0
 ; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa3
-; RV64ID-LP64-NEXT:    neg a0, a0
+; RV64ID-LP64-NEXT:    neg a0, a1
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-LP64-NEXT:    and a0, a0, a1
@@ -474,13 +474,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; CHECK32-IZFHMIN-LABEL: fcvt_si_h_sat:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK32-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK32-IZFHMIN-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; CHECK32-IZFHMIN-NEXT:    lui a1, 815104
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK32-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; CHECK32-IZFHMIN-NEXT:    lui a0, 815104
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK32-IZFHMIN-NEXT:    neg a0, a0
+; CHECK32-IZFHMIN-NEXT:    neg a0, a1
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.w.s a1, fa5, rtz
 ; CHECK32-IZFHMIN-NEXT:    and a0, a0, a1
@@ -489,13 +489,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; CHECK64-IZFHMIN-LABEL: fcvt_si_h_sat:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK64-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK64-IZFHMIN-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; CHECK64-IZFHMIN-NEXT:    lui a1, 815104
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK64-IZFHMIN-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK64-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; CHECK64-IZFHMIN-NEXT:    lui a0, 815104
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK64-IZFHMIN-NEXT:    neg a0, a0
+; CHECK64-IZFHMIN-NEXT:    neg a0, a1
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.l.s a1, fa5, rtz
 ; CHECK64-IZFHMIN-NEXT:    and a0, a0, a1
@@ -504,57 +504,57 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_si_h_sat:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZHINXMIN-NEXT:    lui a2, 815104
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK32-IZHINXMIN-NEXT:    lui a2, 290816
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK32-IZHINXMIN-NEXT:    addi a2, a2, -512
-; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZHINXMIN-LABEL: fcvt_si_h_sat:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZHINXMIN-NEXT:    lui a2, 815104
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK64-IZHINXMIN-NEXT:    lui a2, 290816
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK64-IZHINXMIN-NEXT:    addiw a2, a2, -512
-; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_si_h_sat:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 815104
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 290816
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a2, a2, -512
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_si_h_sat:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 815104
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 290816
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK64-IZDINXZHINXMIN-NEXT:    addiw a2, a2, -512
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    ret
 start:
   %0 = tail call i16 @llvm.fptosi.sat.i16.f16(half %a)
@@ -756,8 +756,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_ui_h_sat:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZHINX-NEXT:    lui a1, 292864
+; RV32IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZHINX-NEXT:    addi a1, a1, -256
 ; RV32IZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV32IZHINX-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -766,8 +766,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV64IZHINX-LABEL: fcvt_ui_h_sat:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZHINX-NEXT:    lui a1, 292864
+; RV64IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZHINX-NEXT:    addiw a1, a1, -256
 ; RV64IZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV64IZHINX-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -776,8 +776,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV32IZDINXZHINX-LABEL: fcvt_ui_h_sat:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZDINXZHINX-NEXT:    lui a1, 292864
+; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZDINXZHINX-NEXT:    addi a1, a1, -256
 ; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -786,8 +786,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV64IZDINXZHINX-LABEL: fcvt_ui_h_sat:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZDINXZHINX-NEXT:    lui a1, 292864
+; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZDINXZHINX-NEXT:    addiw a1, a1, -256
 ; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -956,8 +956,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZHINXMIN-NEXT:    lui a1, 292864
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZHINXMIN-NEXT:    addi a1, a1, -256
 ; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -966,8 +966,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; CHECK64-IZHINXMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZHINXMIN-NEXT:    lui a1, 292864
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZHINXMIN-NEXT:    addiw a1, a1, -256
 ; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -976,8 +976,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 292864
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a1, a1, -256
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -986,8 +986,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 292864
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZDINXZHINXMIN-NEXT:    addiw a1, a1, -256
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -2153,7 +2153,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB10_2
 ; RV32IZFH-NEXT:  # %bb.1: # %start
@@ -2161,19 +2161,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZFH-NEXT:  .LBB10_2: # %start
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB10_4
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB10_4
 ; RV32IZFH-NEXT:  # %bb.3:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB10_4: # %start
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
 ; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    neg a3, s0
+; RV32IZFH-NEXT:    neg a1, s0
+; RV32IZFH-NEXT:    neg a3, a3
+; RV32IZFH-NEXT:    and a0, a1, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a4, a0
 ; RV32IZFH-NEXT:    and a0, a3, a0
-; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a0, a4, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -2201,7 +2201,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IDZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IDZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IDZFH-NEXT:    call __fixsfdi
-; RV32IDZFH-NEXT:    lui a4, 524288
+; RV32IDZFH-NEXT:    lui a3, 524288
 ; RV32IDZFH-NEXT:    lui a2, 524288
 ; RV32IDZFH-NEXT:    beqz s0, .LBB10_2
 ; RV32IDZFH-NEXT:  # %bb.1: # %start
@@ -2209,19 +2209,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IDZFH-NEXT:  .LBB10_2: # %start
 ; RV32IDZFH-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IDZFH-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IDZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IDZFH-NEXT:    beqz a3, .LBB10_4
+; RV32IDZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IDZFH-NEXT:    beqz a1, .LBB10_4
 ; RV32IDZFH-NEXT:  # %bb.3:
-; RV32IDZFH-NEXT:    addi a2, a4, -1
+; RV32IDZFH-NEXT:    addi a2, a3, -1
 ; RV32IDZFH-NEXT:  .LBB10_4: # %start
-; RV32IDZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IDZFH-NEXT:    feq.s a3, fs0, fs0
 ; RV32IDZFH-NEXT:    neg a4, a1
-; RV32IDZFH-NEXT:    and a1, a4, a2
-; RV32IDZFH-NEXT:    neg a2, a3
-; RV32IDZFH-NEXT:    neg a3, s0
+; RV32IDZFH-NEXT:    neg a1, s0
+; RV32IDZFH-NEXT:    neg a3, a3
+; RV32IDZFH-NEXT:    and a0, a1, a0
+; RV32IDZFH-NEXT:    and a1, a3, a2
+; RV32IDZFH-NEXT:    or a0, a4, a0
 ; RV32IDZFH-NEXT:    and a0, a3, a0
-; RV32IDZFH-NEXT:    or a0, a2, a0
-; RV32IDZFH-NEXT:    and a0, a4, a0
 ; RV32IDZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IDZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IDZFH-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -2248,7 +2248,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB10_2
 ; RV32IZHINX-NEXT:  # %bb.1: # %start
@@ -2256,19 +2256,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZHINX-NEXT:  .LBB10_2: # %start
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB10_4
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB10_4
 ; RV32IZHINX-NEXT:  # %bb.3:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB10_4: # %start
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
 ; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    neg a3, s1
+; RV32IZHINX-NEXT:    neg a1, s1
+; RV32IZHINX-NEXT:    neg a3, a3
+; RV32IZHINX-NEXT:    and a0, a1, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a4, a0
 ; RV32IZHINX-NEXT:    and a0, a3, a0
-; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a0, a4, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2295,7 +2295,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZDINXZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZDINXZHINX-NEXT:    mv a0, s0
 ; RV32IZDINXZHINX-NEXT:    call __fixsfdi
-; RV32IZDINXZHINX-NEXT:    lui a4, 524288
+; RV32IZDINXZHINX-NEXT:    lui a3, 524288
 ; RV32IZDINXZHINX-NEXT:    lui a2, 524288
 ; RV32IZDINXZHINX-NEXT:    beqz s1, .LBB10_2
 ; RV32IZDINXZHINX-NEXT:  # %bb.1: # %start
@@ -2303,19 +2303,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZDINXZHINX-NEXT:  .LBB10_2: # %start
 ; RV32IZDINXZHINX-NEXT:    lui a1, 389120
 ; RV32IZDINXZHINX-NEXT:    addi a1, a1, -1
-; RV32IZDINXZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZDINXZHINX-NEXT:    beqz a3, .LBB10_4
+; RV32IZDINXZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZDINXZHINX-NEXT:    beqz a1, .LBB10_4
 ; RV32IZDINXZHINX-NEXT:  # %bb.3:
-; RV32IZDINXZHINX-NEXT:    addi a2, a4, -1
+; RV32IZDINXZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZDINXZHINX-NEXT:  .LBB10_4: # %start
-; RV32IZDINXZHINX-NEXT:    feq.s a1, s0, s0
+; RV32IZDINXZHINX-NEXT:    feq.s a3, s0, s0
 ; RV32IZDINXZHINX-NEXT:    neg a4, a1
-; RV32IZDINXZHINX-NEXT:    and a1, a4, a2
-; RV32IZDINXZHINX-NEXT:    neg a2, a3
-; RV32IZDINXZHINX-NEXT:    neg a3, s1
+; RV32IZDINXZHINX-NEXT:    neg a1, s1
+; RV32IZDINXZHINX-NEXT:    neg a3, a3
+; RV32IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV32IZDINXZHINX-NEXT:    and a1, a3, a2
+; RV32IZDINXZHINX-NEXT:    or a0, a4, a0
 ; RV32IZDINXZHINX-NEXT:    and a0, a3, a0
-; RV32IZDINXZHINX-NEXT:    or a0, a2, a0
-; RV32IZDINXZHINX-NEXT:    and a0, a4, a0
 ; RV32IZDINXZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZDINXZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZDINXZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2342,13 +2342,13 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __extendhfsf2
-; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    lui a1, 913408
 ; RV32I-NEXT:    call __gesf2
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __fixsfdi
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui s5, 524288
 ; RV32I-NEXT:    bgez s0, .LBB10_2
@@ -2357,25 +2357,25 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32I-NEXT:  .LBB10_2: # %start
 ; RV32I-NEXT:    lui a1, 389120
 ; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __gtsf2
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    blez a0, .LBB10_4
 ; RV32I-NEXT:  # %bb.3: # %start
 ; RV32I-NEXT:    addi s3, s5, -1
 ; RV32I-NEXT:  .LBB10_4: # %start
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __unordsf2
 ; RV32I-NEXT:    snez a0, a0
+; RV32I-NEXT:    sgtz a1, s4
+; RV32I-NEXT:    slti a2, s0, 0
 ; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    addi a2, a2, -1
 ; RV32I-NEXT:    and a1, a0, s3
-; RV32I-NEXT:    sgtz a2, s4
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    slti a3, s0, 0
-; RV32I-NEXT:    addi a3, a3, -1
-; RV32I-NEXT:    and a3, a3, s2
-; RV32I-NEXT:    or a2, a2, a3
+; RV32I-NEXT:    and a2, a2, s1
+; RV32I-NEXT:    or a2, a3, a2
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -2444,7 +2444,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    fsw fa4, 4(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    fle.s s0, fa5, fa4
 ; RV32ID-ILP32-NEXT:    call __fixsfdi
-; RV32ID-ILP32-NEXT:    lui a4, 524288
+; RV32ID-ILP32-NEXT:    lui a3, 524288
 ; RV32ID-ILP32-NEXT:    lui a2, 524288
 ; RV32ID-ILP32-NEXT:    beqz s0, .LBB10_2
 ; RV32ID-ILP32-NEXT:  # %bb.1: # %start
@@ -2453,20 +2453,20 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32ID-ILP32-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
 ; RV32ID-ILP32-NEXT:    flw fa4, 4(sp) # 4-byte Folded Reload
-; RV32ID-ILP32-NEXT:    flt.s a3, fa5, fa4
+; RV32ID-ILP32-NEXT:    flt.s a1, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fmv.s fa5, fa4
-; RV32ID-ILP32-NEXT:    beqz a3, .LBB10_4
+; RV32ID-ILP32-NEXT:    beqz a1, .LBB10_4
 ; RV32ID-ILP32-NEXT:  # %bb.3:
-; RV32ID-ILP32-NEXT:    addi a2, a4, -1
+; RV32ID-ILP32-NEXT:    addi a2, a3, -1
 ; RV32ID-ILP32-NEXT:  .LBB10_4: # %start
-; RV32ID-ILP32-NEXT:    feq.s a1, fa5, fa5
+; RV32ID-ILP32-NEXT:    feq.s a3, fa5, fa5
 ; RV32ID-ILP32-NEXT:    neg a4, a1
-; RV32ID-ILP32-NEXT:    and a1, a4, a2
-; RV32ID-ILP32-NEXT:    neg a2, a3
-; RV32ID-ILP32-NEXT:    neg a3, s0
+; RV32ID-ILP32-NEXT:    neg a1, s0
+; RV32ID-ILP32-NEXT:    neg a3, a3
+; RV32ID-ILP32-NEXT:    and a0, a1, a0
+; RV32ID-ILP32-NEXT:    and a1, a3, a2
+; RV32ID-ILP32-NEXT:    or a0, a4, a0
 ; RV32ID-ILP32-NEXT:    and a0, a3, a0
-; RV32ID-ILP32-NEXT:    or a0, a2, a0
-; RV32ID-ILP32-NEXT:    and a0, a4, a0
 ; RV32ID-ILP32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-ILP32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32ID-ILP32-NEXT:    addi sp, sp, 16
@@ -2499,7 +2499,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:    fmv.w.x fa5, a0
 ; RV32ID-NEXT:    fle.s s0, fa5, fa0
 ; RV32ID-NEXT:    call __fixsfdi
-; RV32ID-NEXT:    lui a4, 524288
+; RV32ID-NEXT:    lui a3, 524288
 ; RV32ID-NEXT:    lui a2, 524288
 ; RV32ID-NEXT:    beqz s0, .LBB10_2
 ; RV32ID-NEXT:  # %bb.1: # %start
@@ -2507,19 +2507,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:  .LBB10_2: # %start
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32ID-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32ID-NEXT:    flt.s a3, fa5, fs0
-; RV32ID-NEXT:    beqz a3, .LBB10_4
+; RV32ID-NEXT:    flt.s a1, fa5, fs0
+; RV32ID-NEXT:    beqz a1, .LBB10_4
 ; RV32ID-NEXT:  # %bb.3:
-; RV32ID-NEXT:    addi a2, a4, -1
+; RV32ID-NEXT:    addi a2, a3, -1
 ; RV32ID-NEXT:  .LBB10_4: # %start
-; RV32ID-NEXT:    feq.s a1, fs0, fs0
-; RV32ID-NEXT:    neg a4, a1
-; RV32ID-NEXT:    and a1, a4, a2
-; RV32ID-NEXT:    neg a2, s0
-; RV32ID-NEXT:    and a0, a2, a0
-; RV32ID-NEXT:    neg a2, a3
-; RV32ID-NEXT:    or a0, a2, a0
+; RV32ID-NEXT:    feq.s a3, fs0, fs0
+; RV32ID-NEXT:    neg a4, s0
+; RV32ID-NEXT:    neg a5, a1
+; RV32ID-NEXT:    neg a3, a3
 ; RV32ID-NEXT:    and a0, a4, a0
+; RV32ID-NEXT:    and a1, a3, a2
+; RV32ID-NEXT:    or a0, a5, a0
+; RV32ID-NEXT:    and a0, a3, a0
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -2552,7 +2552,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IFZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IFZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IFZFHMIN-NEXT:    call __fixsfdi
-; RV32IFZFHMIN-NEXT:    lui a4, 524288
+; RV32IFZFHMIN-NEXT:    lui a3, 524288
 ; RV32IFZFHMIN-NEXT:    lui a2, 524288
 ; RV32IFZFHMIN-NEXT:    beqz s0, .LBB10_2
 ; RV32IFZFHMIN-NEXT:  # %bb.1: # %start
@@ -2560,19 +2560,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IFZFHMIN-NEXT:  .LBB10_2: # %start
 ; RV32IFZFHMIN-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IFZFHMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IFZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IFZFHMIN-NEXT:    beqz a3, .LBB10_4
+; RV32IFZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IFZFHMIN-NEXT:    beqz a1, .LBB10_4
 ; RV32IFZFHMIN-NEXT:  # %bb.3:
-; RV32IFZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IFZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IFZFHMIN-NEXT:  .LBB10_4: # %start
-; RV32IFZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IFZFHMIN-NEXT:    feq.s a3, fs0, fs0
 ; RV32IFZFHMIN-NEXT:    neg a4, a1
-; RV32IFZFHMIN-NEXT:    and a1, a4, a2
-; RV32IFZFHMIN-NEXT:    neg a2, a3
-; RV32IFZFHMIN-NEXT:    neg a3, s0
+; RV32IFZFHMIN-NEXT:    neg a1, s0
+; RV32IFZFHMIN-NEXT:    neg a3, a3
+; RV32IFZFHMIN-NEXT:    and a0, a1, a0
+; RV32IFZFHMIN-NEXT:    and a1, a3, a2
+; RV32IFZFHMIN-NEXT:    or a0, a4, a0
 ; RV32IFZFHMIN-NEXT:    and a0, a3, a0
-; RV32IFZFHMIN-NEXT:    or a0, a2, a0
-; RV32IFZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IFZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -2601,7 +2601,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IDZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IDZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IDZFHMIN-NEXT:    call __fixsfdi
-; RV32IDZFHMIN-NEXT:    lui a4, 524288
+; RV32IDZFHMIN-NEXT:    lui a3, 524288
 ; RV32IDZFHMIN-NEXT:    lui a2, 524288
 ; RV32IDZFHMIN-NEXT:    beqz s0, .LBB10_2
 ; RV32IDZFHMIN-NEXT:  # %bb.1: # %start
@@ -2609,19 +2609,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IDZFHMIN-NEXT:  .LBB10_2: # %start
 ; RV32IDZFHMIN-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IDZFHMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IDZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IDZFHMIN-NEXT:    beqz a3, .LBB10_4
+; RV32IDZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IDZFHMIN-NEXT:    beqz a1, .LBB10_4
 ; RV32IDZFHMIN-NEXT:  # %bb.3:
-; RV32IDZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IDZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IDZFHMIN-NEXT:  .LBB10_4: # %start
-; RV32IDZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IDZFHMIN-NEXT:    feq.s a3, fs0, fs0
 ; RV32IDZFHMIN-NEXT:    neg a4, a1
-; RV32IDZFHMIN-NEXT:    and a1, a4, a2
-; RV32IDZFHMIN-NEXT:    neg a2, a3
-; RV32IDZFHMIN-NEXT:    neg a3, s0
+; RV32IDZFHMIN-NEXT:    neg a1, s0
+; RV32IDZFHMIN-NEXT:    neg a3, a3
+; RV32IDZFHMIN-NEXT:    and a0, a1, a0
+; RV32IDZFHMIN-NEXT:    and a1, a3, a2
+; RV32IDZFHMIN-NEXT:    or a0, a4, a0
 ; RV32IDZFHMIN-NEXT:    and a0, a3, a0
-; RV32IDZFHMIN-NEXT:    or a0, a2, a0
-; RV32IDZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IDZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IDZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IDZFHMIN-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -2639,7 +2639,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; CHECK32-IZHINXMIN-NEXT:    mv a0, s0
 ; CHECK32-IZHINXMIN-NEXT:    call __fixsfdi
-; CHECK32-IZHINXMIN-NEXT:    lui a4, 524288
+; CHECK32-IZHINXMIN-NEXT:    lui a3, 524288
 ; CHECK32-IZHINXMIN-NEXT:    lui a2, 524288
 ; CHECK32-IZHINXMIN-NEXT:    beqz s1, .LBB10_2
 ; CHECK32-IZHINXMIN-NEXT:  # %bb.1: # %start
@@ -2647,19 +2647,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-NEXT:  .LBB10_2: # %start
 ; CHECK32-IZHINXMIN-NEXT:    lui a1, 389120
 ; CHECK32-IZHINXMIN-NEXT:    addi a1, a1, -1
-; CHECK32-IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; CHECK32-IZHINXMIN-NEXT:    beqz a3, .LBB10_4
+; CHECK32-IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; CHECK32-IZHINXMIN-NEXT:    beqz a1, .LBB10_4
 ; CHECK32-IZHINXMIN-NEXT:  # %bb.3:
-; CHECK32-IZHINXMIN-NEXT:    addi a2, a4, -1
+; CHECK32-IZHINXMIN-NEXT:    addi a2, a3, -1
 ; CHECK32-IZHINXMIN-NEXT:  .LBB10_4: # %start
-; CHECK32-IZHINXMIN-NEXT:    feq.s a1, s0, s0
+; CHECK32-IZHINXMIN-NEXT:    feq.s a3, s0, s0
 ; CHECK32-IZHINXMIN-NEXT:    neg a4, a1
-; CHECK32-IZHINXMIN-NEXT:    and a1, a4, a2
-; CHECK32-IZHINXMIN-NEXT:    neg a2, a3
-; CHECK32-IZHINXMIN-NEXT:    neg a3, s1
+; CHECK32-IZHINXMIN-NEXT:    neg a1, s1
+; CHECK32-IZHINXMIN-NEXT:    neg a3, a3
+; CHECK32-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZHINXMIN-NEXT:    and a1, a3, a2
+; CHECK32-IZHINXMIN-NEXT:    or a0, a4, a0
 ; CHECK32-IZHINXMIN-NEXT:    and a0, a3, a0
-; CHECK32-IZHINXMIN-NEXT:    or a0, a2, a0
-; CHECK32-IZHINXMIN-NEXT:    and a0, a4, a0
 ; CHECK32-IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; CHECK32-IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2687,7 +2687,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    mv a0, s0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    call __fixsfdi
-; CHECK32-IZDINXZHINXMIN-NEXT:    lui a4, 524288
+; CHECK32-IZDINXZHINXMIN-NEXT:    lui a3, 524288
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 524288
 ; CHECK32-IZDINXZHINXMIN-NEXT:    beqz s1, .LBB10_2
 ; CHECK32-IZDINXZHINXMIN-NEXT:  # %bb.1: # %start
@@ -2695,19 +2695,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-NEXT:  .LBB10_2: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 389120
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a1, a1, -1
-; CHECK32-IZDINXZHINXMIN-NEXT:    flt.s a3, a1, s0
-; CHECK32-IZDINXZHINXMIN-NEXT:    beqz a3, .LBB10_4
+; CHECK32-IZDINXZHINXMIN-NEXT:    flt.s a1, a1, s0
+; CHECK32-IZDINXZHINXMIN-NEXT:    beqz a1, .LBB10_4
 ; CHECK32-IZDINXZHINXMIN-NEXT:  # %bb.3:
-; CHECK32-IZDINXZHINXMIN-NEXT:    addi a2, a4, -1
+; CHECK32-IZDINXZHINXMIN-NEXT:    addi a2, a3, -1
 ; CHECK32-IZDINXZHINXMIN-NEXT:  .LBB10_4: # %start
-; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a1, s0, s0
+; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a3, s0, s0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    neg a4, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a1, a4, a2
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a2, a3
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a3, s1
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a1, s1
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a3, a3
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a1, a3, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    or a0, a4, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a3, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    or a0, a2, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a4, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2907,15 +2907,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IZFH-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; RV32IZFH-NEXT:    fcvt.s.h fa0, fa0
-; RV32IZFH-NEXT:    flt.s a0, fa5, fa0
-; RV32IZFH-NEXT:    neg s0, a0
-; RV32IZFH-NEXT:    fmv.w.x fa5, zero
-; RV32IZFH-NEXT:    fle.s a0, fa5, fa0
+; RV32IZFH-NEXT:    fmv.w.x fa4, zero
+; RV32IZFH-NEXT:    fle.s a0, fa4, fa0
+; RV32IZFH-NEXT:    flt.s a1, fa5, fa0
+; RV32IZFH-NEXT:    neg s0, a1
 ; RV32IZFH-NEXT:    neg s1, a0
 ; RV32IZFH-NEXT:    call __fixunssfdi
 ; RV32IZFH-NEXT:    and a0, s1, a0
-; RV32IZFH-NEXT:    or a0, s0, a0
 ; RV32IZFH-NEXT:    and a1, s1, a1
+; RV32IZFH-NEXT:    or a0, s0, a0
 ; RV32IZFH-NEXT:    or a1, s0, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2941,15 +2941,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32IDZFH-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; RV32IDZFH-NEXT:    fcvt.s.h fa0, fa0
-; RV32IDZFH-NEXT:    flt.s a0, fa5, fa0
-; RV32IDZFH-NEXT:    neg s0, a0
-; RV32IDZFH-NEXT:    fmv.w.x fa5, zero
-; RV32IDZFH-NEXT:    fle.s a0, fa5, fa0
+; RV32IDZFH-NEXT:    fmv.w.x fa4, zero
+; RV32IDZFH-NEXT:    fle.s a0, fa4, fa0
+; RV32IDZFH-NEXT:    flt.s a1, fa5, fa0
+; RV32IDZFH-NEXT:    neg s0, a1
 ; RV32IDZFH-NEXT:    neg s1, a0
 ; RV32IDZFH-NEXT:    call __fixunssfdi
 ; RV32IDZFH-NEXT:    and a0, s1, a0
-; RV32IDZFH-NEXT:    or a0, s0, a0
 ; RV32IDZFH-NEXT:    and a1, s1, a1
+; RV32IDZFH-NEXT:    or a0, s0, a0
 ; RV32IDZFH-NEXT:    or a1, s0, a1
 ; RV32IDZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IDZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2975,14 +2975,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
 ; RV32IZHINX-NEXT:    lui a1, 391168
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
+; RV32IZHINX-NEXT:    fle.s a2, zero, a0
 ; RV32IZHINX-NEXT:    flt.s a1, a1, a0
 ; RV32IZHINX-NEXT:    neg s0, a1
-; RV32IZHINX-NEXT:    fle.s a1, zero, a0
-; RV32IZHINX-NEXT:    neg s1, a1
+; RV32IZHINX-NEXT:    neg s1, a2
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
-; RV32IZHINX-NEXT:    or a0, s0, a0
 ; RV32IZHINX-NEXT:    and a1, s1, a1
+; RV32IZHINX-NEXT:    or a0, s0, a0
 ; RV32IZHINX-NEXT:    or a1, s0, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3008,14 +3008,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
 ; RV32IZDINXZHINX-NEXT:    lui a1, 391168
 ; RV32IZDINXZHINX-NEXT:    addi a1, a1, -1
+; RV32IZDINXZHINX-NEXT:    fle.s a2, zero, a0
 ; RV32IZDINXZHINX-NEXT:    flt.s a1, a1, a0
 ; RV32IZDINXZHINX-NEXT:    neg s0, a1
-; RV32IZDINXZHINX-NEXT:    fle.s a1, zero, a0
-; RV32IZDINXZHINX-NEXT:    neg s1, a1
+; RV32IZDINXZHINX-NEXT:    neg s1, a2
 ; RV32IZDINXZHINX-NEXT:    call __fixunssfdi
 ; RV32IZDINXZHINX-NEXT:    and a0, s1, a0
-; RV32IZDINXZHINX-NEXT:    or a0, s0, a0
 ; RV32IZDINXZHINX-NEXT:    and a1, s1, a1
+; RV32IZDINXZHINX-NEXT:    or a0, s0, a0
 ; RV32IZDINXZHINX-NEXT:    or a1, s0, a1
 ; RV32IZDINXZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZDINXZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3054,8 +3054,8 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfdi
 ; RV32I-NEXT:    and a0, s2, a0
-; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    and a1, s2, a1
+; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    or a1, s1, a1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3106,15 +3106,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI12_0)
 ; RV32ID-ILP32-NEXT:    flw fa5, %lo(.LCPI12_0)(a1)
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
-; RV32ID-ILP32-NEXT:    flt.s a1, fa5, fa4
-; RV32ID-ILP32-NEXT:    neg s0, a1
-; RV32ID-ILP32-NEXT:    fmv.w.x fa5, zero
-; RV32ID-ILP32-NEXT:    fle.s a1, fa5, fa4
+; RV32ID-ILP32-NEXT:    fmv.w.x fa3, zero
+; RV32ID-ILP32-NEXT:    fle.s a1, fa3, fa4
+; RV32ID-ILP32-NEXT:    flt.s a2, fa5, fa4
+; RV32ID-ILP32-NEXT:    neg s0, a2
 ; RV32ID-ILP32-NEXT:    neg s1, a1
 ; RV32ID-ILP32-NEXT:    call __fixunssfdi
 ; RV32ID-ILP32-NEXT:    and a0, s1, a0
-; RV32ID-ILP32-NEXT:    or a0, s0, a0
 ; RV32ID-ILP32-NEXT:    and a1, s1, a1
+; RV32ID-ILP32-NEXT:    or a0, s0, a0
 ; RV32ID-ILP32-NEXT:    or a1, s0, a1
 ; RV32ID-ILP32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-ILP32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3146,15 +3146,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32ID-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
-; RV32ID-NEXT:    flt.s a0, fa5, fa0
-; RV32ID-NEXT:    neg s0, a0
-; RV32ID-NEXT:    fmv.w.x fa5, zero
-; RV32ID-NEXT:    fle.s a0, fa5, fa0
+; RV32ID-NEXT:    fmv.w.x fa4, zero
+; RV32ID-NEXT:    fle.s a0, fa4, fa0
+; RV32ID-NEXT:    flt.s a1, fa5, fa0
+; RV32ID-NEXT:    neg s0, a1
 ; RV32ID-NEXT:    neg s1, a0
 ; RV32ID-NEXT:    call __fixunssfdi
 ; RV32ID-NEXT:    and a0, s1, a0
-; RV32ID-NEXT:    or a0, s0, a0
 ; RV32ID-NEXT:    and a1, s1, a1
+; RV32ID-NEXT:    or a0, s0, a0
 ; RV32ID-NEXT:    or a1, s0, a1
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3185,15 +3185,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK32-IZFHMIN-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa0, fa0
-; CHECK32-IZFHMIN-NEXT:    flt.s a0, fa5, fa0
-; CHECK32-IZFHMIN-NEXT:    neg s0, a0
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa5, zero
-; CHECK32-IZFHMIN-NEXT:    fle.s a0, fa5, fa0
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK32-IZFHMIN-NEXT:    fle.s a0, fa4, fa0
+; CHECK32-IZFHMIN-NEXT:    flt.s a1, fa5, fa0
+; CHECK32-IZFHMIN-NEXT:    neg s0, a1
 ; CHECK32-IZFHMIN-NEXT:    neg s1, a0
 ; CHECK32-IZFHMIN-NEXT:    call __fixunssfdi
 ; CHECK32-IZFHMIN-NEXT:    and a0, s1, a0
-; CHECK32-IZFHMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZFHMIN-NEXT:    and a1, s1, a1
+; CHECK32-IZFHMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZFHMIN-NEXT:    or a1, s0, a1
 ; CHECK32-IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3220,14 +3220,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECK32-IZHINXMIN-NEXT:    lui a1, 391168
 ; CHECK32-IZHINXMIN-NEXT:    addi a1, a1, -1
+; CHECK32-IZHINXMIN-NEXT:    fle.s a2, zero, a0
 ; CHECK32-IZHINXMIN-NEXT:    flt.s a1, a1, a0
 ; CHECK32-IZHINXMIN-NEXT:    neg s0, a1
-; CHECK32-IZHINXMIN-NEXT:    fle.s a1, zero, a0
-; CHECK32-IZHINXMIN-NEXT:    neg s1, a1
+; CHECK32-IZHINXMIN-NEXT:    neg s1, a2
 ; CHECK32-IZHINXMIN-NEXT:    call __fixunssfdi
 ; CHECK32-IZHINXMIN-NEXT:    and a0, s1, a0
-; CHECK32-IZHINXMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZHINXMIN-NEXT:    and a1, s1, a1
+; CHECK32-IZHINXMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZHINXMIN-NEXT:    or a1, s0, a1
 ; CHECK32-IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3254,14 +3254,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 391168
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a1, a1, -1
+; CHECK32-IZDINXZHINXMIN-NEXT:    fle.s a2, zero, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    flt.s a1, a1, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    neg s0, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    fle.s a1, zero, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg s1, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg s1, a2
 ; CHECK32-IZDINXZHINXMIN-NEXT:    call __fixunssfdi
 ; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, s1, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    and a1, s1, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    or a1, s0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -6296,13 +6296,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFH-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFH-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV32IZFH-NEXT:    lui a1, 815104
-; RV32IZFH-NEXT:    fmv.w.x fa3, a1
+; RV32IZFH-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV32IZFH-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV32IZFH-NEXT:    lui a0, 815104
+; RV32IZFH-NEXT:    fmv.w.x fa3, a0
 ; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV32IZFH-NEXT:    neg a0, a0
+; RV32IZFH-NEXT:    neg a0, a1
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IZFH-NEXT:    and a0, a0, a1
@@ -6311,13 +6311,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64IZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFH-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFH-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV64IZFH-NEXT:    lui a1, 815104
-; RV64IZFH-NEXT:    fmv.w.x fa3, a1
+; RV64IZFH-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV64IZFH-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV64IZFH-NEXT:    lui a0, 815104
+; RV64IZFH-NEXT:    fmv.w.x fa3, a0
 ; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV64IZFH-NEXT:    neg a0, a0
+; RV64IZFH-NEXT:    neg a0, a1
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IZFH-NEXT:    and a0, a0, a1
@@ -6326,13 +6326,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32IDZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IDZFH-NEXT:    feq.s a0, fa5, fa5
-; RV32IDZFH-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV32IDZFH-NEXT:    lui a1, 815104
-; RV32IDZFH-NEXT:    fmv.w.x fa3, a1
+; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV32IDZFH-NEXT:    feq.s a1, fa5, fa5
+; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV32IDZFH-NEXT:    lui a0, 815104
+; RV32IDZFH-NEXT:    fmv.w.x fa3, a0
 ; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV32IDZFH-NEXT:    neg a0, a0
+; RV32IDZFH-NEXT:    neg a0, a1
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IDZFH-NEXT:    and a0, a0, a1
@@ -6341,13 +6341,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64IDZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IDZFH-NEXT:    feq.s a0, fa5, fa5
-; RV64IDZFH-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV64IDZFH-NEXT:    lui a1, 815104
-; RV64IDZFH-NEXT:    fmv.w.x fa3, a1
+; RV64IDZFH-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV64IDZFH-NEXT:    feq.s a1, fa5, fa5
+; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV64IDZFH-NEXT:    lui a0, 815104
+; RV64IDZFH-NEXT:    fmv.w.x fa3, a0
 ; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa3
-; RV64IDZFH-NEXT:    neg a0, a0
+; RV64IDZFH-NEXT:    neg a0, a1
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IDZFH-NEXT:    and a0, a0, a1
@@ -6356,57 +6356,57 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZHINX-NEXT:    neg a1, a1
-; RV32IZHINX-NEXT:    lui a2, 815104
-; RV32IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZHINX-NEXT:    lui a1, 815104
 ; RV32IZHINX-NEXT:    lui a2, 290816
+; RV32IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZHINX-NEXT:    feq.s a0, a0, a0
 ; RV32IZHINX-NEXT:    addi a2, a2, -512
-; RV32IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZHINX-NEXT:    and a0, a1, a0
+; RV32IZHINX-NEXT:    neg a0, a0
+; RV32IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZHINX-NEXT:    and a0, a0, a1
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZHINX-NEXT:    neg a1, a1
-; RV64IZHINX-NEXT:    lui a2, 815104
-; RV64IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZHINX-NEXT:    lui a1, 815104
 ; RV64IZHINX-NEXT:    lui a2, 290816
+; RV64IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZHINX-NEXT:    feq.s a0, a0, a0
 ; RV64IZHINX-NEXT:    addiw a2, a2, -512
-; RV64IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZHINX-NEXT:    and a0, a1, a0
+; RV64IZHINX-NEXT:    neg a0, a0
+; RV64IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZHINX-NEXT:    and a0, a0, a1
 ; RV64IZHINX-NEXT:    ret
 ;
 ; RV32IZDINXZHINX-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZDINXZHINX-NEXT:    neg a1, a1
-; RV32IZDINXZHINX-NEXT:    lui a2, 815104
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZDINXZHINX-NEXT:    lui a1, 815104
 ; RV32IZDINXZHINX-NEXT:    lui a2, 290816
+; RV32IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZDINXZHINX-NEXT:    feq.s a0, a0, a0
 ; RV32IZDINXZHINX-NEXT:    addi a2, a2, -512
-; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZDINXZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV32IZDINXZHINX-NEXT:    neg a0, a0
+; RV32IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZDINXZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    ret
 ;
 ; RV64IZDINXZHINX-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZDINXZHINX-NEXT:    neg a1, a1
-; RV64IZDINXZHINX-NEXT:    lui a2, 815104
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZDINXZHINX-NEXT:    lui a1, 815104
 ; RV64IZDINXZHINX-NEXT:    lui a2, 290816
+; RV64IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZDINXZHINX-NEXT:    feq.s a0, a0, a0
 ; RV64IZDINXZHINX-NEXT:    addiw a2, a2, -512
-; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZDINXZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV64IZDINXZHINX-NEXT:    neg a0, a0
+; RV64IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZDINXZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcvt_w_s_sat_i16:
@@ -6505,13 +6505,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
-; RV32ID-ILP32-NEXT:    feq.s a0, fa5, fa5
-; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV32ID-ILP32-NEXT:    lui a1, 815104
-; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a1
+; RV32ID-ILP32-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV32ID-ILP32-NEXT:    feq.s a1, fa5, fa5
+; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV32ID-ILP32-NEXT:    lui a0, 815104
+; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a0
 ; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa3
-; RV32ID-ILP32-NEXT:    neg a0, a0
+; RV32ID-ILP32-NEXT:    neg a0, a1
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-ILP32-NEXT:    and a0, a0, a1
@@ -6525,13 +6525,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
-; RV64ID-LP64-NEXT:    feq.s a0, fa5, fa5
-; RV64ID-LP64-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV64ID-LP64-NEXT:    lui a1, 815104
-; RV64ID-LP64-NEXT:    fmv.w.x fa3, a1
+; RV64ID-LP64-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV64ID-LP64-NEXT:    feq.s a1, fa5, fa5
+; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV64ID-LP64-NEXT:    lui a0, 815104
+; RV64ID-LP64-NEXT:    fmv.w.x fa3, a0
 ; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa3
-; RV64ID-LP64-NEXT:    neg a0, a0
+; RV64ID-LP64-NEXT:    neg a0, a1
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-LP64-NEXT:    and a0, a0, a1
@@ -6580,13 +6580,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; CHECK32-IZFHMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK32-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK32-IZFHMIN-NEXT:    lui a1, %hi(.LCPI32_0)
-; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; CHECK32-IZFHMIN-NEXT:    lui a1, 815104
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI32_0)
+; CHECK32-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; CHECK32-IZFHMIN-NEXT:    lui a0, 815104
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK32-IZFHMIN-NEXT:    neg a0, a0
+; CHECK32-IZFHMIN-NEXT:    neg a0, a1
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.w.s a1, fa5, rtz
 ; CHECK32-IZFHMIN-NEXT:    and a0, a0, a1
@@ -6595,13 +6595,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; CHECK64-IZFHMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK64-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK64-IZFHMIN-NEXT:    lui a1, %hi(.LCPI32_0)
-; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; CHECK64-IZFHMIN-NEXT:    lui a1, 815104
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK64-IZFHMIN-NEXT:    lui a0, %hi(.LCPI32_0)
+; CHECK64-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; CHECK64-IZFHMIN-NEXT:    lui a0, 815104
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
-; CHECK64-IZFHMIN-NEXT:    neg a0, a0
+; CHECK64-IZFHMIN-NEXT:    neg a0, a1
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.l.s a1, fa5, rtz
 ; CHECK64-IZFHMIN-NEXT:    and a0, a0, a1
@@ -6610,57 +6610,57 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZHINXMIN-NEXT:    lui a2, 815104
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK32-IZHINXMIN-NEXT:    lui a2, 290816
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK32-IZHINXMIN-NEXT:    addi a2, a2, -512
-; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZHINXMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZHINXMIN-NEXT:    lui a2, 815104
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK64-IZHINXMIN-NEXT:    lui a2, 290816
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK64-IZHINXMIN-NEXT:    addiw a2, a2, -512
-; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 815104
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 290816
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a2, a2, -512
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 815104
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 290816
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK64-IZDINXZHINXMIN-NEXT:    addiw a2, a2, -512
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    ret
 start:
   %0 = tail call i16 @llvm.fptosi.sat.i16.f16(half %a)
@@ -6861,8 +6861,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZHINX-NEXT:    lui a1, 292864
+; RV32IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZHINX-NEXT:    addi a1, a1, -256
 ; RV32IZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV32IZHINX-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -6871,8 +6871,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV64IZHINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZHINX-NEXT:    lui a1, 292864
+; RV64IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZHINX-NEXT:    addiw a1, a1, -256
 ; RV64IZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV64IZHINX-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -6881,8 +6881,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV32IZDINXZHINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZDINXZHINX-NEXT:    lui a1, 292864
+; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZDINXZHINX-NEXT:    addi a1, a1, -256
 ; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -6891,8 +6891,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV64IZDINXZHINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZDINXZHINX-NEXT:    lui a1, 292864
+; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZDINXZHINX-NEXT:    addiw a1, a1, -256
 ; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -7067,8 +7067,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZHINXMIN-NEXT:    lui a1, 292864
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZHINXMIN-NEXT:    addi a1, a1, -256
 ; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -7077,8 +7077,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; CHECK64-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZHINXMIN-NEXT:    lui a1, 292864
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZHINXMIN-NEXT:    addiw a1, a1, -256
 ; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -7087,8 +7087,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 292864
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a1, a1, -256
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -7097,8 +7097,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 292864
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZDINXZHINXMIN-NEXT:    addiw a1, a1, -256
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -7258,12 +7258,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_w_s_sat_i8:
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV32IZFH-NEXT:    lui a0, 798720
+; RV32IZFH-NEXT:    lui a1, 274400
+; RV32IZFH-NEXT:    fmv.w.x fa4, a0
 ; RV32IZFH-NEXT:    feq.s a0, fa5, fa5
 ; RV32IZFH-NEXT:    neg a0, a0
-; RV32IZFH-NEXT:    lui a1, 798720
-; RV32IZFH-NEXT:    fmv.w.x fa4, a1
 ; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
-; RV32IZFH-NEXT:    lui a1, 274400
 ; RV32IZFH-NEXT:    fmv.w.x fa4, a1
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7273,12 +7273,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV64IZFH-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV64IZFH-NEXT:    lui a0, 798720
+; RV64IZFH-NEXT:    lui a1, 274400
+; RV64IZFH-NEXT:    fmv.w.x fa4, a0
 ; RV64IZFH-NEXT:    feq.s a0, fa5, fa5
 ; RV64IZFH-NEXT:    neg a0, a0
-; RV64IZFH-NEXT:    lui a1, 798720
-; RV64IZFH-NEXT:    fmv.w.x fa4, a1
 ; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
-; RV64IZFH-NEXT:    lui a1, 274400
 ; RV64IZFH-NEXT:    fmv.w.x fa4, a1
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7288,12 +7288,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32IDZFH-LABEL: fcvt_w_s_sat_i8:
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV32IDZFH-NEXT:    lui a0, 798720
+; RV32IDZFH-NEXT:    lui a1, 274400
+; RV32IDZFH-NEXT:    fmv.w.x fa4, a0
 ; RV32IDZFH-NEXT:    feq.s a0, fa5, fa5
 ; RV32IDZFH-NEXT:    neg a0, a0
-; RV32IDZFH-NEXT:    lui a1, 798720
-; RV32IDZFH-NEXT:    fmv.w.x fa4, a1
 ; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
-; RV32IDZFH-NEXT:    lui a1, 274400
 ; RV32IDZFH-NEXT:    fmv.w.x fa4, a1
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7303,12 +7303,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV64IDZFH-LABEL: fcvt_w_s_sat_i8:
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV64IDZFH-NEXT:    lui a0, 798720
+; RV64IDZFH-NEXT:    lui a1, 274400
+; RV64IDZFH-NEXT:    fmv.w.x fa4, a0
 ; RV64IDZFH-NEXT:    feq.s a0, fa5, fa5
 ; RV64IDZFH-NEXT:    neg a0, a0
-; RV64IDZFH-NEXT:    lui a1, 798720
-; RV64IDZFH-NEXT:    fmv.w.x fa4, a1
 ; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
-; RV64IDZFH-NEXT:    lui a1, 274400
 ; RV64IDZFH-NEXT:    fmv.w.x fa4, a1
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7318,53 +7318,53 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_w_s_sat_i8:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZHINX-NEXT:    neg a1, a1
-; RV32IZHINX-NEXT:    lui a2, 798720
-; RV32IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZHINX-NEXT:    lui a1, 798720
 ; RV32IZHINX-NEXT:    lui a2, 274400
-; RV32IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZHINX-NEXT:    and a0, a1, a0
+; RV32IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZHINX-NEXT:    feq.s a0, a0, a0
+; RV32IZHINX-NEXT:    neg a0, a0
+; RV32IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZHINX-NEXT:    and a0, a0, a1
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZHINX-NEXT:    neg a1, a1
-; RV64IZHINX-NEXT:    lui a2, 798720
-; RV64IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZHINX-NEXT:    lui a1, 798720
 ; RV64IZHINX-NEXT:    lui a2, 274400
-; RV64IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZHINX-NEXT:    and a0, a1, a0
+; RV64IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZHINX-NEXT:    feq.s a0, a0, a0
+; RV64IZHINX-NEXT:    neg a0, a0
+; RV64IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZHINX-NEXT:    and a0, a0, a1
 ; RV64IZHINX-NEXT:    ret
 ;
 ; RV32IZDINXZHINX-LABEL: fcvt_w_s_sat_i8:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZDINXZHINX-NEXT:    neg a1, a1
-; RV32IZDINXZHINX-NEXT:    lui a2, 798720
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZDINXZHINX-NEXT:    lui a1, 798720
 ; RV32IZDINXZHINX-NEXT:    lui a2, 274400
-; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZDINXZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV32IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZDINXZHINX-NEXT:    feq.s a0, a0, a0
+; RV32IZDINXZHINX-NEXT:    neg a0, a0
+; RV32IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZDINXZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    ret
 ;
 ; RV64IZDINXZHINX-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZDINXZHINX-NEXT:    neg a1, a1
-; RV64IZDINXZHINX-NEXT:    lui a2, 798720
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZDINXZHINX-NEXT:    lui a1, 798720
 ; RV64IZDINXZHINX-NEXT:    lui a2, 274400
-; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZDINXZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV64IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZDINXZHINX-NEXT:    feq.s a0, a0, a0
+; RV64IZDINXZHINX-NEXT:    neg a0, a0
+; RV64IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZDINXZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcvt_w_s_sat_i8:
@@ -7459,12 +7459,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
+; RV32ID-ILP32-NEXT:    lui a0, 798720
+; RV32ID-ILP32-NEXT:    lui a1, 274400
+; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32-NEXT:    feq.s a0, fa5, fa5
 ; RV32ID-ILP32-NEXT:    neg a0, a0
-; RV32ID-ILP32-NEXT:    lui a1, 798720
-; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
-; RV32ID-ILP32-NEXT:    lui a1, 274400
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7479,12 +7479,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
+; RV64ID-LP64-NEXT:    lui a0, 798720
+; RV64ID-LP64-NEXT:    lui a1, 274400
+; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64-NEXT:    feq.s a0, fa5, fa5
 ; RV64ID-LP64-NEXT:    neg a0, a0
-; RV64ID-LP64-NEXT:    lui a1, 798720
-; RV64ID-LP64-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
-; RV64ID-LP64-NEXT:    lui a1, 274400
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7499,11 +7499,11 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    feq.s a0, fa0, fa0
-; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    lui a1, 798720
 ; RV32ID-NEXT:    fmv.w.x fa5, a1
-; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32ID-NEXT:    lui a1, 274400
+; RV32ID-NEXT:    neg a0, a0
+; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32ID-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7518,11 +7518,11 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
 ; RV64ID-NEXT:    feq.s a0, fa0, fa0
-; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    lui a1, 798720
 ; RV64ID-NEXT:    fmv.w.x fa5, a1
-; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64ID-NEXT:    lui a1, 274400
+; RV64ID-NEXT:    neg a0, a0
+; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64ID-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7534,12 +7534,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; CHECK32-IZFHMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECK32-IZFHMIN-NEXT:    lui a0, 798720
+; CHECK32-IZFHMIN-NEXT:    lui a1, 274400
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK32-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
 ; CHECK32-IZFHMIN-NEXT:    neg a0, a0
-; CHECK32-IZFHMIN-NEXT:    lui a1, 798720
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
-; CHECK32-IZFHMIN-NEXT:    lui a1, 274400
 ; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7549,12 +7549,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; CHECK64-IZFHMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECK64-IZFHMIN-NEXT:    lui a0, 798720
+; CHECK64-IZFHMIN-NEXT:    lui a1, 274400
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK64-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
 ; CHECK64-IZFHMIN-NEXT:    neg a0, a0
-; CHECK64-IZFHMIN-NEXT:    lui a1, 798720
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
-; CHECK64-IZFHMIN-NEXT:    lui a1, 274400
 ; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7564,53 +7564,53 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZHINXMIN-NEXT:    lui a2, 798720
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZHINXMIN-NEXT:    lui a1, 798720
 ; CHECK32-IZHINXMIN-NEXT:    lui a2, 274400
-; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZHINXMIN-NEXT:    feq.s a0, a0, a0
+; CHECK32-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZHINXMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZHINXMIN-NEXT:    lui a2, 798720
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZHINXMIN-NEXT:    lui a1, 798720
 ; CHECK64-IZHINXMIN-NEXT:    lui a2, 274400
-; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZHINXMIN-NEXT:    feq.s a0, a0, a0
+; CHECK64-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 798720
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 798720
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 274400
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 798720
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 798720
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 274400
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    ret
 start:
   %0 = tail call i8 @llvm.fptosi.sat.i8.f16(half %a)
@@ -7769,8 +7769,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
 ; RV32IZFH-NEXT:    fmv.w.x fa4, zero
-; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    lui a0, 276464
+; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fmv.w.x fa4, a0
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7780,8 +7780,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
 ; RV64IZFH-NEXT:    fmv.w.x fa4, zero
-; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    lui a0, 276464
+; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fmv.w.x fa4, a0
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -7791,8 +7791,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
 ; RV32IDZFH-NEXT:    fmv.w.x fa4, zero
-; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    lui a0, 276464
+; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fmv.w.x fa4, a0
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7802,8 +7802,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
 ; RV64IDZFH-NEXT:    fmv.w.x fa4, zero
-; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    lui a0, 276464
+; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fmv.w.x fa4, a0
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -7926,8 +7926,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, zero
-; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    lui a0, 276464
+; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7942,8 +7942,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, zero
-; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    lui a0, 276464
+; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -7957,8 +7957,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    fmv.w.x fa5, zero
-; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32ID-NEXT:    lui a0, 276464
+; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32ID-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7972,8 +7972,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
 ; RV64ID-NEXT:    fmv.w.x fa5, zero
-; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64ID-NEXT:    lui a0, 276464
+; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64ID-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -7985,8 +7985,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    lui a0, 276464
+; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7996,8 +7996,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    lui a0, 276464
+; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.lu.s a0, fa5, rtz
diff --git a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
index 4bc595bcc4cc8f..12cf088e3205fd 100644
--- a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
@@ -297,8 +297,8 @@ define i32 @fcmp_ord(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINXMIN-LABEL: fcmp_ord:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    ret
@@ -608,8 +608,8 @@ define i32 @fcmp_uno(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINXMIN-LABEL: fcmp_uno:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    xori a0, a0, 1
@@ -823,8 +823,8 @@ define i32 @fcmps_ord(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINXMIN-LABEL: fcmps_ord:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fle.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    fle.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fle.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    ret
@@ -1063,8 +1063,8 @@ define i32 @fcmps_uno(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINXMIN-LABEL: fcmps_uno:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fle.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    fle.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fle.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    xori a0, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/half-fcmp.ll b/llvm/test/CodeGen/RISCV/half-fcmp.ll
index 81ef56635eebcf..d25d8cc1c19948 100644
--- a/llvm/test/CodeGen/RISCV/half-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/half-fcmp.ll
@@ -426,9 +426,9 @@ define i32 @fcmp_ord(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-ILP32F-LP64F-LABEL: fcmp_ord:
 ; CHECKIZFHMIN-ILP32F-LP64F:       # %bb.0:
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa5, fa1
+; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa4, fa0
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a0, fa5, fa5
-; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa5, fa0
-; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a1, fa5, fa5
+; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a1, fa4, fa4
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    and a0, a1, a0
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    ret
 ;
@@ -437,8 +437,8 @@ define i32 @fcmp_ord(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a1
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
-; CHECKIZFHMIN-NEXT:    feq.s a0, fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
+; CHECKIZFHMIN-NEXT:    feq.s a0, fa4, fa4
 ; CHECKIZFHMIN-NEXT:    feq.s a1, fa5, fa5
 ; CHECKIZFHMIN-NEXT:    and a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    ret
@@ -446,8 +446,8 @@ define i32 @fcmp_ord(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: fcmp_ord:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    ret
@@ -864,9 +864,9 @@ define i32 @fcmp_uno(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-ILP32F-LP64F-LABEL: fcmp_uno:
 ; CHECKIZFHMIN-ILP32F-LP64F:       # %bb.0:
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa5, fa1
+; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa4, fa0
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a0, fa5, fa5
-; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa5, fa0
-; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a1, fa5, fa5
+; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a1, fa4, fa4
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    and a0, a1, a0
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    xori a0, a0, 1
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    ret
@@ -876,8 +876,8 @@ define i32 @fcmp_uno(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a1
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
-; CHECKIZFHMIN-NEXT:    feq.s a0, fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
+; CHECKIZFHMIN-NEXT:    feq.s a0, fa4, fa4
 ; CHECKIZFHMIN-NEXT:    feq.s a1, fa5, fa5
 ; CHECKIZFHMIN-NEXT:    and a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    xori a0, a0, 1
@@ -886,8 +886,8 @@ define i32 @fcmp_uno(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: fcmp_uno:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    xori a0, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
index f063c1fef4e16b..0d26e660c979bb 100644
--- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
@@ -2050,8 +2050,8 @@ define half @copysign_f16(half %a, half %b) nounwind {
 ; RV32I-LABEL: copysign_f16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 1048568
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 17
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -2059,8 +2059,8 @@ define half @copysign_f16(half %a, half %b) nounwind {
 ; RV64I-LABEL: copysign_f16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 1048568
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 49
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -2094,8 +2094,8 @@ define half @copysign_f16(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-NEXT:    # kill: def $x11_h killed $x11_h def $x11
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32IZHINXMIN-NEXT:    lui a2, 1048568
-; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
+; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -2106,8 +2106,8 @@ define half @copysign_f16(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-NEXT:    # kill: def $x11_h killed $x11_h def $x11
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64IZHINXMIN-NEXT:    lui a2, 1048568
-; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
+; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -2801,8 +2801,8 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV32I-LABEL: isnan_d_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 17
-; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    li a1, 31
+; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    slli a1, a1, 10
 ; RV32I-NEXT:    slt a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -2810,8 +2810,8 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV64I-LABEL: isnan_d_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 49
-; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    li a1, 31
+; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    slli a1, a1, 10
 ; RV64I-NEXT:    slt a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -2819,9 +2819,9 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV32IZFHMIN-LABEL: isnan_d_fpclass:
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    fmv.x.h a0, fa0
+; RV32IZFHMIN-NEXT:    li a1, 31
 ; RV32IZFHMIN-NEXT:    slli a0, a0, 17
 ; RV32IZFHMIN-NEXT:    srli a0, a0, 17
-; RV32IZFHMIN-NEXT:    li a1, 31
 ; RV32IZFHMIN-NEXT:    slli a1, a1, 10
 ; RV32IZFHMIN-NEXT:    slt a0, a1, a0
 ; RV32IZFHMIN-NEXT:    ret
@@ -2829,9 +2829,9 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV64IZFHMIN-LABEL: isnan_d_fpclass:
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    fmv.x.h a0, fa0
+; RV64IZFHMIN-NEXT:    li a1, 31
 ; RV64IZFHMIN-NEXT:    slli a0, a0, 49
 ; RV64IZFHMIN-NEXT:    srli a0, a0, 49
-; RV64IZFHMIN-NEXT:    li a1, 31
 ; RV64IZFHMIN-NEXT:    slli a1, a1, 10
 ; RV64IZFHMIN-NEXT:    slt a0, a1, a0
 ; RV64IZFHMIN-NEXT:    ret
@@ -2840,8 +2840,8 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV32IZHINXMIN:       # %bb.0:
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
-; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    li a1, 31
+; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    slli a1, a1, 10
 ; RV32IZHINXMIN-NEXT:    slt a0, a1, a0
 ; RV32IZHINXMIN-NEXT:    ret
@@ -2850,8 +2850,8 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV64IZHINXMIN:       # %bb.0:
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
-; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    li a1, 31
+; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    slli a1, a1, 10
 ; RV64IZHINXMIN-NEXT:    slt a0, a1, a0
 ; RV64IZHINXMIN-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/half-mem.ll b/llvm/test/CodeGen/RISCV/half-mem.ll
index 5b6a94a83f94bf..9ac2a4d037f8a8 100644
--- a/llvm/test/CodeGen/RISCV/half-mem.ll
+++ b/llvm/test/CodeGen/RISCV/half-mem.ll
@@ -134,10 +134,10 @@ define half @flh_fsh_global(half %a, half %b) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECKIZFHMIN-NEXT:    lui a0, %hi(G)
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa4, fa5
+; CHECKIZFHMIN-NEXT:    flh fa4, %lo(G)(a0)
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa0, fa5
-; CHECKIZFHMIN-NEXT:    lui a0, %hi(G)
-; CHECKIZFHMIN-NEXT:    flh fa5, %lo(G)(a0)
 ; CHECKIZFHMIN-NEXT:    addi a1, a0, %lo(G)
 ; CHECKIZFHMIN-NEXT:    fsh fa0, %lo(G)(a0)
 ; CHECKIZFHMIN-NEXT:    flh fa5, 18(a1)
@@ -148,14 +148,14 @@ define half @flh_fsh_global(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    lui a2, %hi(G)
 ; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, a1
+; CHECKIZHINXMIN-NEXT:    lh zero, %lo(G)(a2)
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECKIZHINXMIN-NEXT:    lui a1, %hi(G)
-; CHECKIZHINXMIN-NEXT:    lh zero, %lo(G)(a1)
-; CHECKIZHINXMIN-NEXT:    addi a2, a1, %lo(G)
-; CHECKIZHINXMIN-NEXT:    sh a0, %lo(G)(a1)
-; CHECKIZHINXMIN-NEXT:    lh zero, 18(a2)
-; CHECKIZHINXMIN-NEXT:    sh a0, 18(a2)
+; CHECKIZHINXMIN-NEXT:    addi a1, a2, %lo(G)
+; CHECKIZHINXMIN-NEXT:    sh a0, %lo(G)(a2)
+; CHECKIZHINXMIN-NEXT:    lh zero, 18(a1)
+; CHECKIZHINXMIN-NEXT:    sh a0, 18(a1)
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = fadd half %a, %b
   %2 = load volatile half, ptr @G
diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
index 9e1a26e74d70b9..3b645bf8aef912 100644
--- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
@@ -115,7 +115,7 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB1_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -123,19 +123,19 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB1_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI1_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI1_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB1_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB1_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB1_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -172,7 +172,7 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB1_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -180,19 +180,19 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB1_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB1_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB1_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB1_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -236,13 +236,13 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB1_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -250,19 +250,19 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB1_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB1_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB1_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB1_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -313,7 +313,7 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB1_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -321,19 +321,19 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB1_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB1_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB1_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB1_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -529,10 +529,10 @@ define i64 @test_floor_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI3_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI3_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -572,11 +572,11 @@ define i64 @test_floor_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -630,10 +630,10 @@ define i64 @test_floor_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI3_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI3_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -687,11 +687,11 @@ define i64 @test_floor_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -823,7 +823,7 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB5_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -831,19 +831,19 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB5_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI5_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI5_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB5_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB5_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB5_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -880,7 +880,7 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB5_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -888,19 +888,19 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB5_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB5_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB5_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB5_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -944,13 +944,13 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB5_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -958,19 +958,19 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB5_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI5_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI5_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB5_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB5_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB5_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1021,7 +1021,7 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB5_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -1029,19 +1029,19 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB5_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB5_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB5_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB5_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1237,10 +1237,10 @@ define i64 @test_ceil_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI7_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI7_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1280,11 +1280,11 @@ define i64 @test_ceil_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1338,10 +1338,10 @@ define i64 @test_ceil_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI7_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI7_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1395,11 +1395,11 @@ define i64 @test_ceil_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1531,7 +1531,7 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB9_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -1539,19 +1539,19 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB9_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI9_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI9_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB9_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB9_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB9_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1588,7 +1588,7 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB9_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -1596,19 +1596,19 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB9_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB9_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB9_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB9_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1652,13 +1652,13 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB9_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -1666,19 +1666,19 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB9_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI9_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI9_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB9_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB9_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB9_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1729,7 +1729,7 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB9_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -1737,19 +1737,19 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB9_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB9_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB9_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB9_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1945,10 +1945,10 @@ define i64 @test_trunc_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI11_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI11_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1988,11 +1988,11 @@ define i64 @test_trunc_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2046,10 +2046,10 @@ define i64 @test_trunc_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI11_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI11_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2103,11 +2103,11 @@ define i64 @test_trunc_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2239,7 +2239,7 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB13_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -2247,19 +2247,19 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB13_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI13_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI13_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB13_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB13_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB13_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -2296,7 +2296,7 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB13_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -2304,19 +2304,19 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB13_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB13_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB13_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB13_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2360,13 +2360,13 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB13_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -2374,19 +2374,19 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB13_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI13_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI13_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB13_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB13_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB13_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -2437,7 +2437,7 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB13_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -2445,19 +2445,19 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB13_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB13_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB13_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB13_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2653,10 +2653,10 @@ define i64 @test_round_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI15_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI15_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2696,11 +2696,11 @@ define i64 @test_round_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2754,10 +2754,10 @@ define i64 @test_round_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI15_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI15_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2811,11 +2811,11 @@ define i64 @test_round_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2947,7 +2947,7 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB17_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -2955,19 +2955,19 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB17_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI17_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI17_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB17_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB17_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB17_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -3004,7 +3004,7 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB17_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -3012,19 +3012,19 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB17_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB17_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB17_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB17_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -3068,13 +3068,13 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB17_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -3082,19 +3082,19 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB17_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI17_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI17_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB17_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB17_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB17_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -3145,7 +3145,7 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB17_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -3153,19 +3153,19 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB17_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB17_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB17_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB17_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -3361,10 +3361,10 @@ define i64 @test_roundeven_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI19_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI19_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3404,11 +3404,11 @@ define i64 @test_roundeven_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3462,10 +3462,10 @@ define i64 @test_roundeven_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI19_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI19_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3519,11 +3519,11 @@ define i64 @test_roundeven_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3655,7 +3655,7 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB21_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -3663,19 +3663,19 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB21_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI21_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI21_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB21_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB21_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB21_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -3712,7 +3712,7 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB21_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -3720,19 +3720,19 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB21_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB21_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB21_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB21_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -3776,13 +3776,13 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB21_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -3790,19 +3790,19 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB21_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI21_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI21_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB21_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB21_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB21_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -3853,7 +3853,7 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB21_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -3861,19 +3861,19 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB21_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB21_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB21_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB21_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -4069,10 +4069,10 @@ define i64 @test_rint_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI23_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI23_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -4112,11 +4112,11 @@ define i64 @test_rint_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -4170,10 +4170,10 @@ define i64 @test_rint_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI23_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI23_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -4227,11 +4227,11 @@ define i64 @test_rint_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll
index b793c500fc397b..d92dcb9eac4c61 100644
--- a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll
@@ -358,8 +358,8 @@ define half @select_fcmp_ord(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: select_fcmp_ord:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a2, a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a3, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a2, a2, a2
 ; CHECKIZHINXMIN-NEXT:    feq.s a3, a3, a3
 ; CHECKIZHINXMIN-NEXT:    and a2, a3, a2
 ; CHECKIZHINXMIN-NEXT:    bnez a2, .LBB7_2
@@ -689,8 +689,8 @@ define half @select_fcmp_uno(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: select_fcmp_uno:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a2, a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a3, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a2, a2, a2
 ; CHECKIZHINXMIN-NEXT:    feq.s a3, a3, a3
 ; CHECKIZHINXMIN-NEXT:    and a2, a3, a2
 ; CHECKIZHINXMIN-NEXT:    beqz a2, .LBB14_2
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index a0c85ab4dca7f7..66cde323ce507d 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) {
 define i128 @abs128(i128 %x) {
 ; RV32I-LABEL: abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a3, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a2, .LBB8_2
+; RV32I-NEXT:    bgez a3, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    neg a5, a1
 ; RV32I-NEXT:    snez a6, a4
-; RV32I-NEXT:    snez a7, a3
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    sltu t0, a5, a6
+; RV32I-NEXT:    snez a7, a2
 ; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, t0
-; RV32I-NEXT:    sub a1, a5, a6
 ; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    sub a4, a4, a7
-; RV32I-NEXT:    neg a3, a3
+; RV32I-NEXT:    sltu a3, a5, a6
+; RV32I-NEXT:    neg a7, a1
+; RV32I-NEXT:    sub a1, a5, a6
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a3, 0(a1)
+; RV32ZBB-NEXT:    lw a3, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a2, .LBB8_2
+; RV32ZBB-NEXT:    bgez a3, .LBB8_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    neg a5, a1
 ; RV32ZBB-NEXT:    snez a6, a4
-; RV32ZBB-NEXT:    snez a7, a3
-; RV32ZBB-NEXT:    or a6, a7, a6
-; RV32ZBB-NEXT:    sltu t0, a5, a6
+; RV32ZBB-NEXT:    snez a7, a2
 ; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, t0
-; RV32ZBB-NEXT:    sub a1, a5, a6
 ; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    or a6, a7, a6
+; RV32ZBB-NEXT:    add a1, a3, a1
 ; RV32ZBB-NEXT:    sub a4, a4, a7
-; RV32ZBB-NEXT:    neg a3, a3
+; RV32ZBB-NEXT:    sltu a3, a5, a6
+; RV32ZBB-NEXT:    neg a7, a1
+; RV32ZBB-NEXT:    sub a1, a5, a6
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:  .LBB8_2:
-; RV32ZBB-NEXT:    sw a3, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a4, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: abs128:
@@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) {
 define i128 @select_abs128(i128 %x) {
 ; RV32I-LABEL: select_abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a3, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a2, .LBB9_2
+; RV32I-NEXT:    bgez a3, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    neg a5, a1
 ; RV32I-NEXT:    snez a6, a4
-; RV32I-NEXT:    snez a7, a3
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    sltu t0, a5, a6
+; RV32I-NEXT:    snez a7, a2
 ; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, t0
-; RV32I-NEXT:    sub a1, a5, a6
 ; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    sub a4, a4, a7
-; RV32I-NEXT:    neg a3, a3
+; RV32I-NEXT:    sltu a3, a5, a6
+; RV32I-NEXT:    neg a7, a1
+; RV32I-NEXT:    sub a1, a5, a6
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: select_abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a3, 0(a1)
+; RV32ZBB-NEXT:    lw a3, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a2, .LBB9_2
+; RV32ZBB-NEXT:    bgez a3, .LBB9_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    neg a5, a1
 ; RV32ZBB-NEXT:    snez a6, a4
-; RV32ZBB-NEXT:    snez a7, a3
-; RV32ZBB-NEXT:    or a6, a7, a6
-; RV32ZBB-NEXT:    sltu t0, a5, a6
+; RV32ZBB-NEXT:    snez a7, a2
 ; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, t0
-; RV32ZBB-NEXT:    sub a1, a5, a6
 ; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    or a6, a7, a6
+; RV32ZBB-NEXT:    add a1, a3, a1
 ; RV32ZBB-NEXT:    sub a4, a4, a7
-; RV32ZBB-NEXT:    neg a3, a3
+; RV32ZBB-NEXT:    sltu a3, a5, a6
+; RV32ZBB-NEXT:    neg a7, a1
+; RV32ZBB-NEXT:    sub a1, a5, a6
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:  .LBB9_2:
-; RV32ZBB-NEXT:    sw a3, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a4, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: select_abs128:
diff --git a/llvm/test/CodeGen/RISCV/imm.ll b/llvm/test/CodeGen/RISCV/imm.ll
index 70bcb066fe4f0a..830f381b659d18 100644
--- a/llvm/test/CodeGen/RISCV/imm.ll
+++ b/llvm/test/CodeGen/RISCV/imm.ll
@@ -888,8 +888,8 @@ define i64 @imm64_8() nounwind {
 ; RV32I-LABEL: imm64_8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 633806
-; RV32I-NEXT:    addi a0, a0, -272
 ; RV32I-NEXT:    lui a1, 74565
+; RV32I-NEXT:    addi a0, a0, -272
 ; RV32I-NEXT:    addi a1, a1, 1656
 ; RV32I-NEXT:    ret
 ;
@@ -1190,8 +1190,8 @@ define i64 @imm_right_shifted_lui_1() nounwind {
 ; RV32I-LABEL: imm_right_shifted_lui_1:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 1048575
-; RV32I-NEXT:    addi a0, a0, 1
 ; RV32I-NEXT:    lui a1, 16
+; RV32I-NEXT:    addi a0, a0, 1
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret
 ;
@@ -1427,8 +1427,8 @@ define i64 @imm_end_2addi_1() nounwind {
 ; RV32I-LABEL: imm_end_2addi_1:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 1048575
-; RV32I-NEXT:    addi a0, a0, 2047
 ; RV32I-NEXT:    lui a1, 1048512
+; RV32I-NEXT:    addi a0, a0, 2047
 ; RV32I-NEXT:    addi a1, a1, 127
 ; RV32I-NEXT:    ret
 ;
@@ -2446,8 +2446,8 @@ define i64 @imm_neg_8798043653189() {
 ; RV32I-LABEL: imm_neg_8798043653189:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 572348
-; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    lui a1, 1048575
+; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    addi a1, a1, 2047
 ; RV32I-NEXT:    ret
 ;
@@ -2512,8 +2512,8 @@ define i64 @imm_9223372034904144827() {
 ; RV32I-LABEL: imm_9223372034904144827:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 572348
-; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret
 ;
@@ -2578,8 +2578,8 @@ define i64 @imm_neg_9223354442718100411() {
 ; RV32I-LABEL: imm_neg_9223354442718100411:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 572348
-; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    lui a1, 524287
+; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret
 ;
@@ -2895,8 +2895,8 @@ define i64 @imm_12900924131259() {
 ; RV32I-LABEL: imm_12900924131259:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 765952
-; RV32I-NEXT:    addi a0, a0, 1979
 ; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a0, a0, 1979
 ; RV32I-NEXT:    addi a1, a1, -1093
 ; RV32I-NEXT:    ret
 ;
@@ -3017,8 +3017,8 @@ define i64 @imm_12900936431479() {
 ; RV32I-LABEL: imm_12900936431479:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 768955
-; RV32I-NEXT:    addi a0, a0, 1911
 ; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a0, a0, 1911
 ; RV32I-NEXT:    addi a1, a1, -1093
 ; RV32I-NEXT:    ret
 ;
@@ -3089,8 +3089,8 @@ define i64 @imm_12900918536874() {
 ; RV32I-LABEL: imm_12900918536874:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 764587
-; RV32I-NEXT:    addi a0, a0, -1366
 ; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a0, a0, -1366
 ; RV32I-NEXT:    addi a1, a1, -1093
 ; RV32I-NEXT:    ret
 ;
@@ -3161,8 +3161,8 @@ define i64 @imm_12900925247761() {
 ; RV32I-LABEL: imm_12900925247761:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 766225
-; RV32I-NEXT:    addi a0, a0, 273
 ; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a0, a0, 273
 ; RV32I-NEXT:    addi a1, a1, -1093
 ; RV32I-NEXT:    ret
 ;
@@ -4165,8 +4165,8 @@ define i64 @imm64_0xFF7FFFFF7FFFFFFE() {
 ; RV32I-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 524288
-; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    lui a1, 1046528
+; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
index b1afdded62d69a..d58e6fe7675da6 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
@@ -12,8 +12,8 @@ define double @constraint_f_double(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    fadd.d fa5, fa5, fa4
@@ -45,8 +45,8 @@ define double @constraint_cf_double(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    fadd.d fa5, fa5, fa4
@@ -78,8 +78,8 @@ define double @constraint_f_double_abi_name(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa1, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa1, 8(sp)
 ; RV32F-NEXT:    fld fs0, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    fadd.d ft0, fa1, fs0
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
index 581cf8e3bf3c9e..238a0fa0b6fd72 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
@@ -15,8 +15,8 @@ define double @constraint_f_double(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20)
@@ -48,8 +48,8 @@ define double @constraint_cf_double(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20)
@@ -81,8 +81,8 @@ define double @constraint_f_double_abi_name(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa1, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa1, 8(sp)
 ; RV32F-NEXT:    fld fs0, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    .insn 0x4, 0x02000053 | (0 << 7) | (11 << 15) | (8 << 20)
diff --git a/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll b/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll
index fa6ac96b57b1eb..973eb9f41f4fe3 100644
--- a/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll
+++ b/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll
@@ -29,8 +29,8 @@ define void @foo_i32() nounwind #0 {
 ; CHECK-RV32-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    lui a0, %hi(a)
-; CHECK-RV32-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32-NEXT:    lui a1, %hi(b)
+; CHECK-RV32-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32-NEXT:    add a0, a1, a0
 ; CHECK-RV32-NEXT:    lui a1, %hi(c)
@@ -46,8 +46,8 @@ define void @foo_i32() nounwind #0 {
 ; CHECK-RV32IF-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    lui a0, %hi(a)
-; CHECK-RV32IF-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(b)
+; CHECK-RV32IF-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IF-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32IF-NEXT:    add a0, a1, a0
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(c)
@@ -63,8 +63,8 @@ define void @foo_i32() nounwind #0 {
 ; CHECK-RV32IFD-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32IFD-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32IFD-NEXT:    lui a0, %hi(a)
-; CHECK-RV32IFD-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IFD-NEXT:    lui a1, %hi(b)
+; CHECK-RV32IFD-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IFD-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32IFD-NEXT:    add a0, a1, a0
 ; CHECK-RV32IFD-NEXT:    lui a1, %hi(c)
@@ -94,8 +94,8 @@ define void @foo_fp_i32() nounwind #1 {
 ; CHECK-RV32-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    addi s0, sp, 16
 ; CHECK-RV32-NEXT:    lui a0, %hi(a)
-; CHECK-RV32-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32-NEXT:    lui a1, %hi(b)
+; CHECK-RV32-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32-NEXT:    add a0, a1, a0
 ; CHECK-RV32-NEXT:    lui a1, %hi(c)
@@ -116,8 +116,8 @@ define void @foo_fp_i32() nounwind #1 {
 ; CHECK-RV32IF-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    addi s0, sp, 16
 ; CHECK-RV32IF-NEXT:    lui a0, %hi(a)
-; CHECK-RV32IF-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(b)
+; CHECK-RV32IF-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IF-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32IF-NEXT:    add a0, a1, a0
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(c)
@@ -138,8 +138,8 @@ define void @foo_fp_i32() nounwind #1 {
 ; CHECK-RV32IFD-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32IFD-NEXT:    addi s0, sp, 16
 ; CHECK-RV32IFD-NEXT:    lui a0, %hi(a)
-; CHECK-RV32IFD-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IFD-NEXT:    lui a1, %hi(b)
+; CHECK-RV32IFD-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IFD-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32IFD-NEXT:    add a0, a1, a0
 ; CHECK-RV32IFD-NEXT:    lui a1, %hi(c)
@@ -182,8 +182,8 @@ define void @foo_float() nounwind #0 {
 ; CHECK-RV32-NEXT:    sw t5, 4(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    sw t6, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    lui a0, %hi(e)
-; CHECK-RV32-NEXT:    lw a0, %lo(e)(a0)
 ; CHECK-RV32-NEXT:    lui a1, %hi(f)
+; CHECK-RV32-NEXT:    lw a0, %lo(e)(a0)
 ; CHECK-RV32-NEXT:    lw a1, %lo(f)(a1)
 ; CHECK-RV32-NEXT:    call __addsf3
 ; CHECK-RV32-NEXT:    lui a1, %hi(d)
@@ -277,8 +277,8 @@ define void @foo_fp_float() nounwind #1 {
 ; CHECK-RV32-NEXT:    sw t6, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    addi s0, sp, 80
 ; CHECK-RV32-NEXT:    lui a0, %hi(e)
-; CHECK-RV32-NEXT:    lw a0, %lo(e)(a0)
 ; CHECK-RV32-NEXT:    lui a1, %hi(f)
+; CHECK-RV32-NEXT:    lw a0, %lo(e)(a0)
 ; CHECK-RV32-NEXT:    lw a1, %lo(f)(a1)
 ; CHECK-RV32-NEXT:    call __addsf3
 ; CHECK-RV32-NEXT:    lui a1, %hi(d)
@@ -382,9 +382,9 @@ define void @foo_double() nounwind #0 {
 ; CHECK-RV32-NEXT:    sw t5, 4(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    sw t6, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    lui a1, %hi(h)
+; CHECK-RV32-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32-NEXT:    lw a0, %lo(h)(a1)
 ; CHECK-RV32-NEXT:    lw a1, %lo(h+4)(a1)
-; CHECK-RV32-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32-NEXT:    lw a2, %lo(i)(a3)
 ; CHECK-RV32-NEXT:    lw a3, %lo(i+4)(a3)
 ; CHECK-RV32-NEXT:    call __adddf3
@@ -450,9 +450,9 @@ define void @foo_double() nounwind #0 {
 ; CHECK-RV32IF-NEXT:    fsw ft10, 4(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    fsw ft11, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(h)
+; CHECK-RV32IF-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32IF-NEXT:    lw a0, %lo(h)(a1)
 ; CHECK-RV32IF-NEXT:    lw a1, %lo(h+4)(a1)
-; CHECK-RV32IF-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32IF-NEXT:    lw a2, %lo(i)(a3)
 ; CHECK-RV32IF-NEXT:    lw a3, %lo(i+4)(a3)
 ; CHECK-RV32IF-NEXT:    call __adddf3
@@ -549,9 +549,9 @@ define void @foo_fp_double() nounwind #1 {
 ; CHECK-RV32-NEXT:    sw t6, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    addi s0, sp, 80
 ; CHECK-RV32-NEXT:    lui a1, %hi(h)
+; CHECK-RV32-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32-NEXT:    lw a0, %lo(h)(a1)
 ; CHECK-RV32-NEXT:    lw a1, %lo(h+4)(a1)
-; CHECK-RV32-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32-NEXT:    lw a2, %lo(i)(a3)
 ; CHECK-RV32-NEXT:    lw a3, %lo(i+4)(a3)
 ; CHECK-RV32-NEXT:    call __adddf3
@@ -620,9 +620,9 @@ define void @foo_fp_double() nounwind #1 {
 ; CHECK-RV32IF-NEXT:    fsw ft11, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    addi s0, sp, 160
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(h)
+; CHECK-RV32IF-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32IF-NEXT:    lw a0, %lo(h)(a1)
 ; CHECK-RV32IF-NEXT:    lw a1, %lo(h+4)(a1)
-; CHECK-RV32IF-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32IF-NEXT:    lw a2, %lo(i)(a3)
 ; CHECK-RV32IF-NEXT:    lw a3, %lo(i+4)(a3)
 ; CHECK-RV32IF-NEXT:    call __adddf3
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index 19f40dddeaec25..111b3e2bf82ce3 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -8,17 +8,17 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ; RV32-LABEL: ctz_nxv4i32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vid.v v11
+; RV32-NEXT:    vid.v v10
 ; RV32-NEXT:    li a1, -1
-; RV32-NEXT:    vmadd.vx v11, a1, v10
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v8, 0
+; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vmadd.vx v10, a1, v8
 ; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmerge.vvm v8, v8, v11, v0
+; RV32-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    sub a0, a0, a1
@@ -29,17 +29,17 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ; RV64-LABEL: ctz_nxv4i32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    srli a0, a0, 1
 ; RV64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
-; RV64-NEXT:    vid.v v11
+; RV64-NEXT:    vid.v v10
 ; RV64-NEXT:    li a1, -1
-; RV64-NEXT:    vmadd.vx v11, a1, v10
 ; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
+; RV64-NEXT:    srli a0, a0, 1
 ; RV64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vmadd.vx v10, a1, v8
 ; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vvm v8, v8, v11, v0
+; RV64-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    subw a0, a0, a1
@@ -75,28 +75,28 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    addi a2, sp, 16
 ; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
-; RV32-NEXT:    vid.v v16
+; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vid.v v8
 ; RV32-NEXT:    li a2, -1
-; RV32-NEXT:    vmadd.vx v16, a2, v8
-; RV32-NEXT:    addi a2, sp, 32
-; RV32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vl2r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vmsne.vi v0, v8, 0
+; RV32-NEXT:    vmsne.vi v0, v24, 0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vmadd.vx v8, a2, v16
+; RV32-NEXT:    vmv.v.i v16, 0
+; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    vmerge.vim v16, v16, -1, v0
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    li a4, 32
+; RV32-NEXT:    vmv.x.s a3, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a4
+; RV32-NEXT:    vsrl.vx v8, v8, a2
+; RV32-NEXT:    sltu a2, a0, a3
 ; RV32-NEXT:    vmv.x.s a4, v8
 ; RV32-NEXT:    sub a1, a1, a4
-; RV32-NEXT:    sub a1, a1, a3
-; RV32-NEXT:    sub a0, a0, a2
+; RV32-NEXT:    sub a1, a1, a2
+; RV32-NEXT:    sub a0, a0, a3
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 1
 ; RV32-NEXT:    add sp, sp, a2
@@ -111,15 +111,15 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vmv.v.x v16, a0
-; RV64-NEXT:    vid.v v24
+; RV64-NEXT:    vid.v v16
 ; RV64-NEXT:    li a1, -1
-; RV64-NEXT:    vmadd.vx v24, a1, v16
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vmadd.vx v16, a1, v8
 ; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vvm v8, v8, v24, v0
+; RV64-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll b/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
index c7ba0e501fa44f..4a338ce5bd1f78 100644
--- a/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
+++ b/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
@@ -25,8 +25,8 @@ define i1 @shifts_necmp_i16_i8(i16 %x) nounwind {
 ; RV32I-LABEL: shifts_necmp_i16_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 16
-; RV32I-NEXT:    srli a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srli a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 8
 ; RV32I-NEXT:    srli a0, a0, 16
 ; RV32I-NEXT:    xor a0, a0, a1
@@ -36,8 +36,8 @@ define i1 @shifts_necmp_i16_i8(i16 %x) nounwind {
 ; RV64I-LABEL: shifts_necmp_i16_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a0, 48
-; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 8
 ; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    xor a0, a0, a1
@@ -638,10 +638,10 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a2, a0, 128
 ; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:    sltiu a2, a2, 256
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    snez a0, a0
-; RV32I-NEXT:    sltiu a1, a2, 256
-; RV32I-NEXT:    xori a1, a1, 1
+; RV32I-NEXT:    xori a1, a2, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
@@ -754,9 +754,9 @@ define i1 @add_ugecmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind {
 ; RV32I-LABEL: add_ugecmp_bad_i16_i8_cmp:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    addi a0, a0, 128
 ; RV32I-NEXT:    addi a2, a2, -1
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    addi a0, a0, 128
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    sltu a0, a0, a1
 ; RV32I-NEXT:    xori a0, a0, 1
@@ -765,9 +765,9 @@ define i1 @add_ugecmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind {
 ; RV64I-LABEL: add_ugecmp_bad_i16_i8_cmp:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    addi a0, a0, 128
 ; RV64I-NEXT:    addiw a2, a2, -1
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    addi a0, a0, 128
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    sltu a0, a0, a1
 ; RV64I-NEXT:    xori a0, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
index 8693283e83712d..43719a452c2360 100644
--- a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
+++ b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
@@ -32,19 +32,19 @@ define void @test(i32 signext %row, i32 signext %N.in) nounwind {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    blez a1, .LBB0_3
 ; RV32-NEXT:  # %bb.1: # %cond_true.preheader
-; RV32-NEXT:    slli a0, a0, 6
-; RV32-NEXT:    lui a2, %hi(A)
-; RV32-NEXT:    addi a2, a2, %lo(A)
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    li a2, 4
+; RV32-NEXT:    slli a2, a0, 6
+; RV32-NEXT:    lui a3, %hi(A)
+; RV32-NEXT:    addi a3, a3, %lo(A)
+; RV32-NEXT:    li a0, 4
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    addi a2, a2, 8
 ; RV32-NEXT:    li a3, 5
 ; RV32-NEXT:  .LBB0_2: # %cond_true
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    sw a2, -4(a0)
-; RV32-NEXT:    sw a3, 0(a0)
+; RV32-NEXT:    sw a0, -4(a2)
+; RV32-NEXT:    sw a3, 0(a2)
 ; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    addi a0, a0, 4
+; RV32-NEXT:    addi a2, a2, 4
 ; RV32-NEXT:    bnez a1, .LBB0_2
 ; RV32-NEXT:  .LBB0_3: # %return
 ; RV32-NEXT:    ret
@@ -53,24 +53,24 @@ define void @test(i32 signext %row, i32 signext %N.in) nounwind {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    blez a1, .LBB0_3
 ; RV64-NEXT:  # %bb.1: # %cond_true.preheader
-; RV64-NEXT:    slli a0, a0, 6
-; RV64-NEXT:    lui a2, %hi(A)
-; RV64-NEXT:    addi a2, a2, %lo(A)
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    addi a2, a0, 4
+; RV64-NEXT:    slli a3, a0, 6
+; RV64-NEXT:    lui a4, %hi(A)
+; RV64-NEXT:    addi a4, a4, %lo(A)
 ; RV64-NEXT:    addiw a1, a1, 2
-; RV64-NEXT:    li a3, 2
-; RV64-NEXT:    li a4, 4
+; RV64-NEXT:    li a0, 2
+; RV64-NEXT:    li a2, 4
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    addi a4, a3, 4
 ; RV64-NEXT:    li a5, 5
 ; RV64-NEXT:  .LBB0_2: # %cond_true
 ; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NEXT:    sw a4, 0(a2)
-; RV64-NEXT:    slli a6, a3, 2
-; RV64-NEXT:    add a6, a0, a6
+; RV64-NEXT:    sw a2, 0(a4)
+; RV64-NEXT:    slli a6, a0, 2
+; RV64-NEXT:    addiw a0, a0, 1
+; RV64-NEXT:    add a6, a3, a6
 ; RV64-NEXT:    sw a5, 0(a6)
-; RV64-NEXT:    addiw a3, a3, 1
-; RV64-NEXT:    addi a2, a2, 4
-; RV64-NEXT:    bne a3, a1, .LBB0_2
+; RV64-NEXT:    addi a4, a4, 4
+; RV64-NEXT:    bne a0, a1, .LBB0_2
 ; RV64-NEXT:  .LBB0_3: # %return
 ; RV64-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll b/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll
index 2c8839683d8162..b2ccbd821eb8ea 100644
--- a/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll
+++ b/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll
@@ -20,10 +20,10 @@ define i32 @main() nounwind {
 ; RV32I-NEXT:  .LBB0_1: # %for.body
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    addi a4, a0, -2048
-; RV32I-NEXT:    sw a4, 0(a2)
 ; RV32I-NEXT:    sw a0, 0(a1)
 ; RV32I-NEXT:    addi a0, a0, 1
 ; RV32I-NEXT:    addi a1, a1, 4
+; RV32I-NEXT:    sw a4, 0(a2)
 ; RV32I-NEXT:    addi a2, a2, 4
 ; RV32I-NEXT:    bne a0, a3, .LBB0_1
 ; RV32I-NEXT:  # %bb.2: # %for.end
diff --git a/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
index e30bdfb939471f..83e9bf661ab1ca 100644
--- a/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
+++ b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
@@ -13,10 +13,10 @@ define i32 @test(ptr %a, i64 %n)  {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v9, (a0)
 ; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vredsum.vs v9, v9, v8
 ; CHECK-NEXT:    vmv.x.s a3, v9
 ; CHECK-NEXT:    addw a3, a3, a3
-; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, 8
 ; CHECK-NEXT:    bnez a1, .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %exit
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index e852579c724f8e..d529ae6ecd0aba 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -2283,8 +2283,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a1, a1, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
@@ -2294,8 +2294,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -2305,8 +2305,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a1, a1, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
@@ -2316,8 +2316,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
@@ -2452,8 +2452,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -2472,8 +2472,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -2492,8 +2492,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -2512,8 +2512,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -2668,8 +2668,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
@@ -2692,8 +2692,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
@@ -2848,8 +2848,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB26_2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -2886,8 +2886,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB26_2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -3037,8 +3037,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
@@ -3055,16 +3055,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
@@ -3087,8 +3087,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
@@ -3105,16 +3105,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
@@ -3278,16 +3278,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
@@ -3326,16 +3326,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
@@ -4449,25 +4449,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-NEXT:    ret
@@ -4475,25 +4475,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-NEXT:    ret
@@ -4501,25 +4501,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
@@ -4527,25 +4527,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -4556,16 +4556,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a1, 3(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 0(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 1(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 2(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 1(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 2(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 0(a0)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a1, a4, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a2, a3
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a6, a7
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a0, a5
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a7, a0
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a5, a6
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
@@ -4576,20 +4576,20 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 0(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a2, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a5, a5, a6
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 24
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a3, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
@@ -4597,25 +4597,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
@@ -4623,25 +4623,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
@@ -4784,8 +4784,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -4804,8 +4804,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
@@ -4962,8 +4962,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -4982,8 +4982,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index f0c14ccb0d5f23..860c3a94abc0a7 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -1410,20 +1410,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-NEXT:    or a4, a4, a5
@@ -1466,20 +1466,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a4, a4, a5
@@ -1522,20 +1522,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a4, a4, a5
@@ -1578,20 +1578,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a5
@@ -1719,20 +1719,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-NEXT:    or a4, a4, a5
@@ -1775,20 +1775,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a4, a4, a5
@@ -1831,20 +1831,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a4, a4, a5
@@ -1887,20 +1887,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a5
@@ -1998,20 +1998,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-NEXT:    or a4, a4, a5
@@ -2034,20 +2034,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a4, a4, a5
@@ -2070,20 +2070,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a4, a4, a5
@@ -2106,20 +2106,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a5
@@ -2197,20 +2197,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-NEXT:    or a4, a4, a5
@@ -2233,20 +2233,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a4, a4, a5
@@ -2269,20 +2269,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a4, a4, a5
@@ -2305,20 +2305,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a5
@@ -2979,8 +2979,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a1, a1, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
@@ -2990,8 +2990,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -3001,8 +3001,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a1, a1, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
@@ -3012,8 +3012,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
@@ -3148,8 +3148,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -3168,8 +3168,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -3188,8 +3188,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -3208,8 +3208,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -3364,8 +3364,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
@@ -3388,8 +3388,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
@@ -3544,8 +3544,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB26_2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -3582,8 +3582,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB26_2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -3733,8 +3733,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
@@ -3751,16 +3751,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
@@ -3783,8 +3783,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
@@ -3801,16 +3801,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
@@ -3974,16 +3974,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
@@ -4022,16 +4022,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
@@ -5809,25 +5809,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-NEXT:    ret
@@ -5835,25 +5835,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-NEXT:    ret
@@ -5861,25 +5861,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
@@ -5887,25 +5887,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -5916,16 +5916,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a1, 3(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 0(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 1(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 2(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 1(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 2(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 0(a0)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a1, a4, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a2, a3
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a6, a7
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a0, a5
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a7, a0
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a5, a6
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
@@ -5936,20 +5936,20 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 0(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a2, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a5, a5, a6
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 24
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a3, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
@@ -5957,25 +5957,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
@@ -5983,25 +5983,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
@@ -6144,8 +6144,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -6164,8 +6164,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
@@ -6322,8 +6322,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -6342,8 +6342,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll
index 5b8955ee0e0a0c..1ab3722080f700 100644
--- a/llvm/test/CodeGen/RISCV/memcpy.ll
+++ b/llvm/test/CodeGen/RISCV/memcpy.ll
@@ -26,8 +26,8 @@ define i32 @t0() {
 ; RV32-NEXT:    lui a0, %hi(src)
 ; RV32-NEXT:    lw a1, %lo(src)(a0)
 ; RV32-NEXT:    lui a2, %hi(dst)
-; RV32-NEXT:    sw a1, %lo(dst)(a2)
 ; RV32-NEXT:    addi a0, a0, %lo(src)
+; RV32-NEXT:    sw a1, %lo(dst)(a2)
 ; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lh a3, 8(a0)
 ; RV32-NEXT:    lbu a0, 10(a0)
@@ -41,13 +41,13 @@ define i32 @t0() {
 ; RV64-LABEL: t0:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    lui a0, %hi(src)
-; RV64-NEXT:    ld a1, %lo(src)(a0)
-; RV64-NEXT:    lui a2, %hi(dst)
+; RV64-NEXT:    lui a1, %hi(dst)
+; RV64-NEXT:    ld a2, %lo(src)(a0)
 ; RV64-NEXT:    addi a0, a0, %lo(src)
 ; RV64-NEXT:    lh a3, 8(a0)
 ; RV64-NEXT:    lbu a0, 10(a0)
-; RV64-NEXT:    sd a1, %lo(dst)(a2)
-; RV64-NEXT:    addi a1, a2, %lo(dst)
+; RV64-NEXT:    sd a2, %lo(dst)(a1)
+; RV64-NEXT:    addi a1, a1, %lo(dst)
 ; RV64-NEXT:    sh a3, 8(a1)
 ; RV64-NEXT:    sb a0, 10(a1)
 ; RV64-NEXT:    li a0, 0
@@ -103,29 +103,29 @@ define void @t1(ptr nocapture %C) nounwind {
 ; RV32-FAST-LABEL: t1:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lui a1, 1141
+; RV32-FAST-NEXT:    lui a2, 300325
+; RV32-FAST-NEXT:    lui a3, 132181
+; RV32-FAST-NEXT:    lui a4, 340483
+; RV32-FAST-NEXT:    lui a5, 267556
+; RV32-FAST-NEXT:    lui a6, 337154
 ; RV32-FAST-NEXT:    addi a1, a1, -439
 ; RV32-FAST-NEXT:    sw a1, 27(a0)
-; RV32-FAST-NEXT:    lui a1, 300325
+; RV32-FAST-NEXT:    lui a1, 320757
+; RV32-FAST-NEXT:    addi a2, a2, 1107
+; RV32-FAST-NEXT:    addi a3, a3, -689
+; RV32-FAST-NEXT:    addi a4, a4, -947
+; RV32-FAST-NEXT:    sw a4, 16(a0)
+; RV32-FAST-NEXT:    sw a3, 20(a0)
+; RV32-FAST-NEXT:    sw a2, 24(a0)
+; RV32-FAST-NEXT:    lui a2, 365861
+; RV32-FAST-NEXT:    addi a3, a5, 1871
+; RV32-FAST-NEXT:    addi a4, a6, 69
 ; RV32-FAST-NEXT:    addi a1, a1, 1107
-; RV32-FAST-NEXT:    lui a2, 132181
-; RV32-FAST-NEXT:    addi a2, a2, -689
-; RV32-FAST-NEXT:    lui a3, 340483
-; RV32-FAST-NEXT:    addi a3, a3, -947
-; RV32-FAST-NEXT:    sw a3, 16(a0)
-; RV32-FAST-NEXT:    sw a2, 20(a0)
-; RV32-FAST-NEXT:    sw a1, 24(a0)
-; RV32-FAST-NEXT:    lui a1, 267556
-; RV32-FAST-NEXT:    addi a1, a1, 1871
-; RV32-FAST-NEXT:    lui a2, 337154
-; RV32-FAST-NEXT:    addi a2, a2, 69
-; RV32-FAST-NEXT:    lui a3, 320757
-; RV32-FAST-NEXT:    addi a3, a3, 1107
-; RV32-FAST-NEXT:    lui a4, 365861
-; RV32-FAST-NEXT:    addi a4, a4, -1980
-; RV32-FAST-NEXT:    sw a4, 0(a0)
-; RV32-FAST-NEXT:    sw a3, 4(a0)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    sw a1, 12(a0)
+; RV32-FAST-NEXT:    addi a2, a2, -1980
+; RV32-FAST-NEXT:    sw a2, 0(a0)
+; RV32-FAST-NEXT:    sw a1, 4(a0)
+; RV32-FAST-NEXT:    sw a4, 8(a0)
+; RV32-FAST-NEXT:    sw a3, 12(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: t1:
@@ -164,16 +164,16 @@ define void @t2(ptr nocapture %C) nounwind {
 ; RV64-FAST-LABEL: t2:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    lui a1, %hi(.L.str2)
-; RV64-FAST-NEXT:    ld a2, %lo(.L.str2)(a1)
-; RV64-FAST-NEXT:    lui a3, 1156
-; RV64-FAST-NEXT:    addi a3, a3, 332
-; RV64-FAST-NEXT:    sw a3, 32(a0)
+; RV64-FAST-NEXT:    lui a2, 1156
+; RV64-FAST-NEXT:    ld a3, %lo(.L.str2)(a1)
+; RV64-FAST-NEXT:    addi a2, a2, 332
 ; RV64-FAST-NEXT:    addi a1, a1, %lo(.L.str2)
-; RV64-FAST-NEXT:    ld a3, 8(a1)
+; RV64-FAST-NEXT:    sw a2, 32(a0)
+; RV64-FAST-NEXT:    ld a2, 8(a1)
 ; RV64-FAST-NEXT:    ld a4, 16(a1)
 ; RV64-FAST-NEXT:    ld a1, 24(a1)
-; RV64-FAST-NEXT:    sd a2, 0(a0)
-; RV64-FAST-NEXT:    sd a3, 8(a0)
+; RV64-FAST-NEXT:    sd a3, 0(a0)
+; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    sd a4, 16(a0)
 ; RV64-FAST-NEXT:    sd a1, 24(a0)
 ; RV64-FAST-NEXT:    ret
@@ -200,23 +200,23 @@ define void @t3(ptr nocapture %C) nounwind {
 ; RV32-FAST-LABEL: t3:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lui a1, 1109
-; RV32-FAST-NEXT:    addi a1, a1, -689
 ; RV32-FAST-NEXT:    lui a2, 340483
+; RV32-FAST-NEXT:    lui a3, 267556
+; RV32-FAST-NEXT:    lui a4, 337154
+; RV32-FAST-NEXT:    lui a5, 320757
+; RV32-FAST-NEXT:    addi a1, a1, -689
 ; RV32-FAST-NEXT:    addi a2, a2, -947
 ; RV32-FAST-NEXT:    sw a2, 16(a0)
 ; RV32-FAST-NEXT:    sw a1, 20(a0)
-; RV32-FAST-NEXT:    lui a1, 267556
-; RV32-FAST-NEXT:    addi a1, a1, 1871
-; RV32-FAST-NEXT:    lui a2, 337154
-; RV32-FAST-NEXT:    addi a2, a2, 69
-; RV32-FAST-NEXT:    lui a3, 320757
-; RV32-FAST-NEXT:    addi a3, a3, 1107
-; RV32-FAST-NEXT:    lui a4, 365861
-; RV32-FAST-NEXT:    addi a4, a4, -1980
-; RV32-FAST-NEXT:    sw a4, 0(a0)
-; RV32-FAST-NEXT:    sw a3, 4(a0)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    sw a1, 12(a0)
+; RV32-FAST-NEXT:    lui a1, 365861
+; RV32-FAST-NEXT:    addi a2, a3, 1871
+; RV32-FAST-NEXT:    addi a3, a4, 69
+; RV32-FAST-NEXT:    addi a4, a5, 1107
+; RV32-FAST-NEXT:    addi a1, a1, -1980
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: t3:
@@ -253,19 +253,19 @@ define void @t4(ptr nocapture %C) nounwind {
 ; RV32-FAST-LABEL: t4:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    li a1, 32
+; RV32-FAST-NEXT:    lui a2, 132388
+; RV32-FAST-NEXT:    lui a3, 337154
+; RV32-FAST-NEXT:    lui a4, 320757
 ; RV32-FAST-NEXT:    sh a1, 16(a0)
-; RV32-FAST-NEXT:    lui a1, 132388
-; RV32-FAST-NEXT:    addi a1, a1, 1871
-; RV32-FAST-NEXT:    lui a2, 337154
-; RV32-FAST-NEXT:    addi a2, a2, 69
-; RV32-FAST-NEXT:    lui a3, 320757
-; RV32-FAST-NEXT:    addi a3, a3, 1107
-; RV32-FAST-NEXT:    lui a4, 365861
-; RV32-FAST-NEXT:    addi a4, a4, -1980
-; RV32-FAST-NEXT:    sw a4, 0(a0)
-; RV32-FAST-NEXT:    sw a3, 4(a0)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    sw a1, 12(a0)
+; RV32-FAST-NEXT:    lui a1, 365861
+; RV32-FAST-NEXT:    addi a2, a2, 1871
+; RV32-FAST-NEXT:    addi a3, a3, 69
+; RV32-FAST-NEXT:    addi a4, a4, 1107
+; RV32-FAST-NEXT:    addi a1, a1, -1980
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: t4:
@@ -289,34 +289,34 @@ define void @t5(ptr nocapture %C) nounwind {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a1, 84
 ; RV32-NEXT:    li a2, 83
+; RV32-NEXT:    li a3, 89
+; RV32-NEXT:    li a4, 82
+; RV32-NEXT:    li a5, 72
+; RV32-NEXT:    li a6, 68
 ; RV32-NEXT:    sb a2, 4(a0)
 ; RV32-NEXT:    sb a1, 5(a0)
 ; RV32-NEXT:    sb zero, 6(a0)
-; RV32-NEXT:    li a1, 89
-; RV32-NEXT:    li a2, 82
-; RV32-NEXT:    li a3, 72
-; RV32-NEXT:    li a4, 68
-; RV32-NEXT:    sb a4, 0(a0)
-; RV32-NEXT:    sb a3, 1(a0)
-; RV32-NEXT:    sb a2, 2(a0)
-; RV32-NEXT:    sb a1, 3(a0)
+; RV32-NEXT:    sb a6, 0(a0)
+; RV32-NEXT:    sb a5, 1(a0)
+; RV32-NEXT:    sb a4, 2(a0)
+; RV32-NEXT:    sb a3, 3(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: t5:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    li a1, 84
 ; RV64-NEXT:    li a2, 83
+; RV64-NEXT:    li a3, 89
+; RV64-NEXT:    li a4, 82
+; RV64-NEXT:    li a5, 72
+; RV64-NEXT:    li a6, 68
 ; RV64-NEXT:    sb a2, 4(a0)
 ; RV64-NEXT:    sb a1, 5(a0)
 ; RV64-NEXT:    sb zero, 6(a0)
-; RV64-NEXT:    li a1, 89
-; RV64-NEXT:    li a2, 82
-; RV64-NEXT:    li a3, 72
-; RV64-NEXT:    li a4, 68
-; RV64-NEXT:    sb a4, 0(a0)
-; RV64-NEXT:    sb a3, 1(a0)
-; RV64-NEXT:    sb a2, 2(a0)
-; RV64-NEXT:    sb a1, 3(a0)
+; RV64-NEXT:    sb a6, 0(a0)
+; RV64-NEXT:    sb a5, 1(a0)
+; RV64-NEXT:    sb a4, 2(a0)
+; RV64-NEXT:    sb a3, 3(a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-FAST-LABEL: t5:
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index e9b84b3cd97ed2..548c7e1c6ea8ce 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -142,8 +142,8 @@ define i64 @mul64(i64 %a, i64 %b) nounwind {
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    mul a3, a0, a3
 ; RV32IM-NEXT:    mulhu a4, a0, a2
-; RV32IM-NEXT:    add a3, a4, a3
 ; RV32IM-NEXT:    mul a1, a1, a2
+; RV32IM-NEXT:    add a3, a4, a3
 ; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    mul a0, a0, a2
 ; RV32IM-NEXT:    ret
@@ -163,26 +163,25 @@ define i64 @mul64(i64 %a, i64 %b) nounwind {
 define i64 @mul64_constant(i64 %a) nounwind {
 ; RV32I-LABEL: mul64_constant:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a3, a0, 2
-; RV32I-NEXT:    add a2, a3, a0
-; RV32I-NEXT:    sltu a3, a2, a3
-; RV32I-NEXT:    srli a0, a0, 30
+; RV32I-NEXT:    slli a2, a0, 2
+; RV32I-NEXT:    srli a3, a0, 30
 ; RV32I-NEXT:    slli a4, a1, 2
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    add a1, a0, a3
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: mul64_constant:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    li a2, 5
-; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    slli a3, a1, 2
 ; RV32IM-NEXT:    add a1, a3, a1
+; RV32IM-NEXT:    slli a3, a0, 2
+; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    slli a2, a0, 2
-; RV32IM-NEXT:    add a0, a2, a0
+; RV32IM-NEXT:    add a0, a3, a0
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: mul64_constant:
@@ -251,13 +250,13 @@ define i32 @mulhs_positive_constant(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    slli a2, a0, 2
-; RV32I-NEXT:    add a3, a2, a0
-; RV32I-NEXT:    sltu a2, a3, a2
-; RV32I-NEXT:    srli a0, a0, 30
-; RV32I-NEXT:    slli a3, a1, 2
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    srli a3, a0, 30
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    sltu a0, a0, a2
+; RV32I-NEXT:    slli a2, a1, 2
+; RV32I-NEXT:    or a2, a2, a3
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: mulhs_positive_constant:
@@ -293,15 +292,15 @@ define i32 @mulhs_negative_constant(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    slli a2, a0, 2
-; RV32I-NEXT:    add a3, a2, a0
-; RV32I-NEXT:    sltu a2, a3, a2
-; RV32I-NEXT:    srli a0, a0, 30
+; RV32I-NEXT:    srli a3, a0, 30
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    slli a4, a1, 2
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    snez a1, a3
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    snez a0, a0
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
 ;
@@ -704,26 +703,25 @@ define i32 @muli32_p63(i32 %a) nounwind {
 define i64 @muli64_p65(i64 %a) nounwind {
 ; RV32I-LABEL: muli64_p65:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a3, a0, 6
-; RV32I-NEXT:    add a2, a3, a0
-; RV32I-NEXT:    sltu a3, a2, a3
-; RV32I-NEXT:    srli a0, a0, 26
+; RV32I-NEXT:    slli a2, a0, 6
+; RV32I-NEXT:    srli a3, a0, 26
 ; RV32I-NEXT:    slli a4, a1, 6
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    add a1, a0, a3
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_p65:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    li a2, 65
-; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    slli a3, a1, 6
 ; RV32IM-NEXT:    add a1, a3, a1
+; RV32IM-NEXT:    slli a3, a0, 6
+; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    slli a2, a0, 6
-; RV32IM-NEXT:    add a0, a2, a0
+; RV32IM-NEXT:    add a0, a3, a0
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: muli64_p65:
@@ -745,24 +743,24 @@ define i64 @muli64_p63(i64 %a) nounwind {
 ; RV32I-LABEL: muli64_p63:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a0, 6
-; RV32I-NEXT:    sltu a3, a2, a0
-; RV32I-NEXT:    srli a4, a0, 26
-; RV32I-NEXT:    slli a5, a1, 6
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    sub a1, a4, a1
-; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    srli a3, a0, 26
+; RV32I-NEXT:    slli a4, a1, 6
+; RV32I-NEXT:    sltu a5, a2, a0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sub a1, a3, a1
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_p63:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    li a2, 63
-; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    slli a3, a1, 6
 ; RV32IM-NEXT:    sub a1, a3, a1
+; RV32IM-NEXT:    slli a3, a0, 6
+; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    slli a2, a0, 6
-; RV32IM-NEXT:    sub a0, a2, a0
+; RV32IM-NEXT:    sub a0, a3, a0
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: muli64_p63:
@@ -846,12 +844,12 @@ define i64 @muli64_m63(i64 %a) nounwind {
 ; RV32I-LABEL: muli64_m63:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a0, 6
-; RV32I-NEXT:    sltu a3, a0, a2
-; RV32I-NEXT:    srli a4, a0, 26
-; RV32I-NEXT:    slli a5, a1, 6
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    srli a3, a0, 26
+; RV32I-NEXT:    slli a4, a1, 6
+; RV32I-NEXT:    sltu a5, a0, a2
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
@@ -886,17 +884,17 @@ define i64 @muli64_m65(i64 %a) nounwind {
 ; RV32I-LABEL: muli64_m65:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a0, 6
-; RV32I-NEXT:    add a3, a2, a0
-; RV32I-NEXT:    sltu a2, a3, a2
-; RV32I-NEXT:    srli a0, a0, 26
+; RV32I-NEXT:    srli a3, a0, 26
 ; RV32I-NEXT:    slli a4, a1, 6
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    add a0, a0, a2
-; RV32I-NEXT:    snez a1, a3
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a0
-; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    snez a3, a0
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    neg a2, a3
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_m65:
@@ -1129,11 +1127,11 @@ define i64 @muli64_p4352(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a3, a0, 20
 ; RV32I-NEXT:    slli a1, a1, 12
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    slli a3, a0, 8
+; RV32I-NEXT:    slli a4, a0, 12
+; RV32I-NEXT:    add a0, a4, a3
 ; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 8
-; RV32I-NEXT:    slli a3, a0, 12
-; RV32I-NEXT:    add a0, a3, a2
-; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    sltu a2, a0, a4
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
@@ -1173,12 +1171,12 @@ define i64 @muli64_p3840(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a3, a0, 20
 ; RV32I-NEXT:    slli a1, a1, 12
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a0, 8
 ; RV32I-NEXT:    slli a0, a0, 12
-; RV32I-NEXT:    sltu a3, a0, a2
-; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_p3840:
@@ -1261,12 +1259,12 @@ define i64 @muli64_m3840(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 12
+; RV32I-NEXT:    slli a3, a0, 12
 ; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    sltu a3, a0, a2
-; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_m3840:
@@ -1300,105 +1298,103 @@ define i64 @muli64_m3840(i64 %a) nounwind {
 define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32I-LABEL: muli128_m3840:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a3, 8(a1)
-; RV32I-NEXT:    lw a6, 0(a1)
-; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    srli a1, a4, 20
-; RV32I-NEXT:    slli a2, a3, 12
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a4, 24
-; RV32I-NEXT:    slli a7, a3, 8
-; RV32I-NEXT:    or a2, a7, a2
-; RV32I-NEXT:    sltu t0, a2, a1
-; RV32I-NEXT:    srli a7, a3, 20
-; RV32I-NEXT:    slli t1, a5, 12
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    sub t1, a3, a7
-; RV32I-NEXT:    srli a3, a6, 20
-; RV32I-NEXT:    slli a5, a4, 12
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    srli a5, a6, 24
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a2, 8(a1)
+; RV32I-NEXT:    lw a5, 0(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    srli a1, a3, 20
+; RV32I-NEXT:    slli a6, a2, 12
+; RV32I-NEXT:    srli a7, a3, 24
+; RV32I-NEXT:    slli t0, a2, 8
+; RV32I-NEXT:    srli t1, a2, 20
+; RV32I-NEXT:    or a1, a6, a1
+; RV32I-NEXT:    slli a6, a4, 12
+; RV32I-NEXT:    srli t2, a2, 24
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    slli a4, a6, 12
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    sltu a7, a6, a4
-; RV32I-NEXT:    sub t0, t1, t0
+; RV32I-NEXT:    or a2, t0, a7
+; RV32I-NEXT:    srli a7, a5, 20
+; RV32I-NEXT:    or a6, a6, t1
+; RV32I-NEXT:    slli t0, a3, 12
+; RV32I-NEXT:    or t1, a4, t2
+; RV32I-NEXT:    srli t2, a5, 24
+; RV32I-NEXT:    slli t3, a3, 8
+; RV32I-NEXT:    or a3, t0, a7
+; RV32I-NEXT:    slli a4, a5, 12
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or t0, t3, t2
+; RV32I-NEXT:    sltu t2, a2, a1
+; RV32I-NEXT:    sub a6, t1, a6
+; RV32I-NEXT:    sltu a7, a5, a4
+; RV32I-NEXT:    sub a6, a6, t2
 ; RV32I-NEXT:    mv t1, a7
-; RV32I-NEXT:    beq a5, a3, .LBB36_2
+; RV32I-NEXT:    beq t0, a3, .LBB36_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a5, a3
+; RV32I-NEXT:    sltu t1, t0, a3
 ; RV32I-NEXT:  .LBB36_2:
 ; RV32I-NEXT:    sub a2, a2, a1
-; RV32I-NEXT:    sltu a1, a2, t1
-; RV32I-NEXT:    sub a1, t0, a1
+; RV32I-NEXT:    sub a1, t0, a3
+; RV32I-NEXT:    sub a5, a5, a4
+; RV32I-NEXT:    sltu a3, a2, t1
 ; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    sub a5, a5, a3
-; RV32I-NEXT:    sub a3, a5, a7
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
+; RV32I-NEXT:    sub a1, a1, a7
+; RV32I-NEXT:    sub a3, a6, a3
+; RV32I-NEXT:    sw a5, 0(a0)
+; RV32I-NEXT:    sw a1, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli128_m3840:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 0(a1)
-; RV32IM-NEXT:    lw a3, 4(a1)
+; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw a2, 4(a1)
 ; RV32IM-NEXT:    lw a4, 8(a1)
 ; RV32IM-NEXT:    lw a1, 12(a1)
 ; RV32IM-NEXT:    li a5, -15
+; RV32IM-NEXT:    li a6, -1
 ; RV32IM-NEXT:    slli a5, a5, 8
-; RV32IM-NEXT:    mulhu a6, a2, a5
-; RV32IM-NEXT:    mul a7, a3, a5
-; RV32IM-NEXT:    add a6, a7, a6
-; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a3, a5
-; RV32IM-NEXT:    add a7, t0, a7
-; RV32IM-NEXT:    sub a6, a6, a2
-; RV32IM-NEXT:    neg t0, a2
-; RV32IM-NEXT:    sltu t1, a6, t0
-; RV32IM-NEXT:    li t2, -1
-; RV32IM-NEXT:    mulhu t3, a2, t2
-; RV32IM-NEXT:    add t1, t3, t1
-; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a3
-; RV32IM-NEXT:    mul t5, a4, a5
-; RV32IM-NEXT:    sub t5, t5, a2
-; RV32IM-NEXT:    add t6, t4, t5
-; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a3
-; RV32IM-NEXT:    sltu t4, t4, s1
-; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a3, t2
-; RV32IM-NEXT:    add a7, t1, a7
-; RV32IM-NEXT:    add a7, a7, t4
-; RV32IM-NEXT:    sltu t0, t5, t0
+; RV32IM-NEXT:    mulhu a7, a3, a5
+; RV32IM-NEXT:    mul t0, a2, a5
+; RV32IM-NEXT:    mulhu t1, a2, a5
+; RV32IM-NEXT:    neg t2, a3
+; RV32IM-NEXT:    mulhu t3, a3, a6
+; RV32IM-NEXT:    mul t4, a4, a5
+; RV32IM-NEXT:    neg t5, a2
+; RV32IM-NEXT:    mulhu a6, a2, a6
 ; RV32IM-NEXT:    mul a1, a1, a5
-; RV32IM-NEXT:    mulhu t1, a4, a5
-; RV32IM-NEXT:    sub a4, t1, a4
+; RV32IM-NEXT:    mulhu t6, a4, a5
+; RV32IM-NEXT:    add s0, a3, a2
+; RV32IM-NEXT:    mul a5, a3, a5
+; RV32IM-NEXT:    add a7, t0, a7
+; RV32IM-NEXT:    sub t4, t4, a3
+; RV32IM-NEXT:    sub a4, t6, a4
+; RV32IM-NEXT:    sub t6, t3, s0
+; RV32IM-NEXT:    sltu t0, a7, t0
+; RV32IM-NEXT:    sub a3, a7, a3
+; RV32IM-NEXT:    sltu a7, t4, t2
 ; RV32IM-NEXT:    add a1, a4, a1
-; RV32IM-NEXT:    add a3, a2, a3
-; RV32IM-NEXT:    sub a3, t3, a3
-; RV32IM-NEXT:    add a1, a3, a1
-; RV32IM-NEXT:    add a1, a1, t0
-; RV32IM-NEXT:    add a1, a7, a1
-; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    mul a2, a2, a5
-; RV32IM-NEXT:    sw a2, 0(a0)
-; RV32IM-NEXT:    sw a6, 4(a0)
-; RV32IM-NEXT:    sw t6, 8(a0)
+; RV32IM-NEXT:    add t0, t1, t0
+; RV32IM-NEXT:    sltu a4, a3, t2
+; RV32IM-NEXT:    add a1, t6, a1
+; RV32IM-NEXT:    add a4, t3, a4
+; RV32IM-NEXT:    add a1, a1, a7
+; RV32IM-NEXT:    add a4, t0, a4
+; RV32IM-NEXT:    sub a2, a4, a2
+; RV32IM-NEXT:    sltu a4, a4, t0
+; RV32IM-NEXT:    add t4, a2, t4
+; RV32IM-NEXT:    sltu a7, a2, t5
+; RV32IM-NEXT:    add a4, a6, a4
+; RV32IM-NEXT:    sltu a2, t4, a2
+; RV32IM-NEXT:    add a4, a4, a7
+; RV32IM-NEXT:    add a1, a4, a1
+; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    sw a5, 0(a0)
+; RV32IM-NEXT:    sw a3, 4(a0)
+; RV32IM-NEXT:    sw t4, 8(a0)
 ; RV32IM-NEXT:    sw a1, 12(a0)
 ; RV32IM-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32IM-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT:    addi sp, sp, 16
 ; RV32IM-NEXT:    ret
 ;
@@ -1410,12 +1406,12 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    slli a2, a0, 12
+; RV64I-NEXT:    slli a3, a0, 12
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    sltu a3, a0, a2
-; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    sub a1, a1, a2
+; RV64I-NEXT:    sltu a2, a0, a3
+; RV64I-NEXT:    sub a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a3
 ; RV64I-NEXT:    ret
 ;
 ; RV64IM-LABEL: muli128_m3840:
@@ -1435,40 +1431,40 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32I-LABEL: muli128_m63:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    slli a1, a2, 6
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    srli a7, a2, 26
-; RV32I-NEXT:    slli t0, a3, 6
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    mv t0, a4
-; RV32I-NEXT:    beq a3, a7, .LBB37_2
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a2, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    slli a6, a3, 6
+; RV32I-NEXT:    srli a5, a3, 26
+; RV32I-NEXT:    slli t0, a4, 6
+; RV32I-NEXT:    sltu a7, a3, a6
+; RV32I-NEXT:    or t0, t0, a5
+; RV32I-NEXT:    mv a5, a7
+; RV32I-NEXT:    beq a4, t0, .LBB37_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t0, a3, a7
+; RV32I-NEXT:    sltu a5, a4, t0
 ; RV32I-NEXT:  .LBB37_2:
-; RV32I-NEXT:    srli t1, a3, 26
-; RV32I-NEXT:    slli t2, a6, 6
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    sub t2, a6, t1
-; RV32I-NEXT:    sltu t3, t2, t0
-; RV32I-NEXT:    sltu t1, a6, t1
-; RV32I-NEXT:    srli a6, a6, 26
-; RV32I-NEXT:    slli t4, a5, 6
-; RV32I-NEXT:    or a6, t4, a6
-; RV32I-NEXT:    sub a5, a5, a6
-; RV32I-NEXT:    sub a5, a5, t1
-; RV32I-NEXT:    sub a5, a5, t3
-; RV32I-NEXT:    sub a6, t2, t0
-; RV32I-NEXT:    sub a3, a3, a7
-; RV32I-NEXT:    sub a3, a3, a4
-; RV32I-NEXT:    sub a2, a2, a1
-; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a6, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    srli t1, a4, 26
+; RV32I-NEXT:    slli t2, a2, 6
+; RV32I-NEXT:    srli t3, a2, 26
+; RV32I-NEXT:    slli t4, a1, 6
+; RV32I-NEXT:    sub a4, a4, t0
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or t0, t4, t3
+; RV32I-NEXT:    sub a4, a4, a7
+; RV32I-NEXT:    sub a7, a2, a6
+; RV32I-NEXT:    sltu a2, a2, a6
+; RV32I-NEXT:    sub a1, a1, t0
+; RV32I-NEXT:    sltu a6, a7, a5
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sub a2, a7, a5
+; RV32I-NEXT:    sub a1, a1, a6
+; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    sw a4, 4(a0)
+; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli128_m63:
@@ -1476,54 +1472,54 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 0(a1)
-; RV32IM-NEXT:    lw a3, 4(a1)
+; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw a2, 4(a1)
 ; RV32IM-NEXT:    lw a4, 8(a1)
 ; RV32IM-NEXT:    lw a1, 12(a1)
 ; RV32IM-NEXT:    li a5, -63
-; RV32IM-NEXT:    mulhu a6, a2, a5
-; RV32IM-NEXT:    slli a7, a3, 6
-; RV32IM-NEXT:    sub a7, a3, a7
-; RV32IM-NEXT:    add a6, a7, a6
-; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a3, a5
-; RV32IM-NEXT:    add a7, t0, a7
-; RV32IM-NEXT:    sub a6, a6, a2
-; RV32IM-NEXT:    neg t0, a2
-; RV32IM-NEXT:    sltu t1, a6, t0
-; RV32IM-NEXT:    li t2, -1
-; RV32IM-NEXT:    mulhu t3, a2, t2
-; RV32IM-NEXT:    add t1, t3, t1
-; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a3
-; RV32IM-NEXT:    slli t5, a4, 6
-; RV32IM-NEXT:    sub t6, a4, a2
-; RV32IM-NEXT:    sub t5, t6, t5
-; RV32IM-NEXT:    add t6, t4, t5
-; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a3
-; RV32IM-NEXT:    sltu t4, t4, s1
-; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a3, t2
-; RV32IM-NEXT:    add a7, t1, a7
-; RV32IM-NEXT:    add a7, a7, t4
-; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    slli t1, a1, 6
-; RV32IM-NEXT:    sub a1, a1, t1
+; RV32IM-NEXT:    li a6, -1
+; RV32IM-NEXT:    mulhu a7, a3, a5
+; RV32IM-NEXT:    slli t0, a2, 6
+; RV32IM-NEXT:    mulhu t1, a2, a5
+; RV32IM-NEXT:    neg t2, a3
+; RV32IM-NEXT:    mulhu t3, a3, a6
+; RV32IM-NEXT:    slli t4, a4, 6
+; RV32IM-NEXT:    sub t5, a4, a3
+; RV32IM-NEXT:    neg t6, a2
+; RV32IM-NEXT:    mulhu a6, a2, a6
+; RV32IM-NEXT:    slli s0, a1, 6
 ; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    add s1, a3, a2
+; RV32IM-NEXT:    sub t4, t5, t4
+; RV32IM-NEXT:    slli t5, a3, 6
+; RV32IM-NEXT:    sub t0, a2, t0
+; RV32IM-NEXT:    sub a1, a1, s0
 ; RV32IM-NEXT:    sub a5, a5, a4
+; RV32IM-NEXT:    sub a4, t3, s1
+; RV32IM-NEXT:    sub t5, a3, t5
+; RV32IM-NEXT:    add a7, t0, a7
+; RV32IM-NEXT:    sltu s0, t4, t2
 ; RV32IM-NEXT:    add a1, a5, a1
-; RV32IM-NEXT:    add a3, a2, a3
-; RV32IM-NEXT:    sub a3, t3, a3
-; RV32IM-NEXT:    add a1, a3, a1
-; RV32IM-NEXT:    add a1, a1, t0
-; RV32IM-NEXT:    add a1, a7, a1
+; RV32IM-NEXT:    sltu a5, a7, t0
+; RV32IM-NEXT:    sub a3, a7, a3
+; RV32IM-NEXT:    add a1, a4, a1
+; RV32IM-NEXT:    add a5, t1, a5
+; RV32IM-NEXT:    sltu a4, a3, t2
 ; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    slli a3, a2, 6
-; RV32IM-NEXT:    sub a2, a2, a3
-; RV32IM-NEXT:    sw a2, 0(a0)
-; RV32IM-NEXT:    sw a6, 4(a0)
-; RV32IM-NEXT:    sw t6, 8(a0)
+; RV32IM-NEXT:    add a4, t3, a4
+; RV32IM-NEXT:    add a4, a5, a4
+; RV32IM-NEXT:    sub a2, a4, a2
+; RV32IM-NEXT:    sltu a4, a4, a5
+; RV32IM-NEXT:    add t4, a2, t4
+; RV32IM-NEXT:    sltu a5, a2, t6
+; RV32IM-NEXT:    add a4, a6, a4
+; RV32IM-NEXT:    sltu a2, t4, a2
+; RV32IM-NEXT:    add a4, a4, a5
+; RV32IM-NEXT:    add a1, a4, a1
+; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    sw t5, 0(a0)
+; RV32IM-NEXT:    sw a3, 4(a0)
+; RV32IM-NEXT:    sw t4, 8(a0)
 ; RV32IM-NEXT:    sw a1, 12(a0)
 ; RV32IM-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -1533,12 +1529,12 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV64I-LABEL: muli128_m63:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 6
-; RV64I-NEXT:    sltu a3, a0, a2
-; RV64I-NEXT:    srli a4, a0, 58
-; RV64I-NEXT:    slli a5, a1, 6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    srli a3, a0, 58
+; RV64I-NEXT:    slli a4, a1, 6
+; RV64I-NEXT:    sltu a5, a0, a2
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    sub a1, a1, a3
+; RV64I-NEXT:    sub a1, a1, a5
 ; RV64I-NEXT:    sub a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
@@ -1619,17 +1615,17 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    call __muldi3
 ; RV32I-NEXT:    add s2, a0, s2
-; RV32I-NEXT:    add a2, s9, s2
-; RV32I-NEXT:    sltu a3, a2, s9
-; RV32I-NEXT:    sltu a4, s9, s5
-; RV32I-NEXT:    sltu a5, s8, s7
-; RV32I-NEXT:    add a5, s6, a5
-; RV32I-NEXT:    add a4, a5, a4
+; RV32I-NEXT:    sltu a3, s9, s5
+; RV32I-NEXT:    sltu a4, s8, s7
 ; RV32I-NEXT:    add a1, a1, s3
+; RV32I-NEXT:    add a2, s9, s2
+; RV32I-NEXT:    add a4, s6, a4
 ; RV32I-NEXT:    sltu a0, s2, a0
+; RV32I-NEXT:    sltu a5, a2, s9
+; RV32I-NEXT:    add a3, a4, a3
 ; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    add a1, a0, a3
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    add a1, a0, a5
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
@@ -1650,33 +1646,32 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; RV32IM-NEXT:    srai a4, a3, 31
 ; RV32IM-NEXT:    mulhu a5, a0, a2
 ; RV32IM-NEXT:    mul a6, a1, a2
-; RV32IM-NEXT:    add a5, a6, a5
-; RV32IM-NEXT:    sltu a6, a5, a6
 ; RV32IM-NEXT:    mulhu a2, a1, a2
-; RV32IM-NEXT:    add a6, a2, a6
-; RV32IM-NEXT:    mul a2, a0, a3
-; RV32IM-NEXT:    add a5, a2, a5
-; RV32IM-NEXT:    sltu a2, a5, a2
-; RV32IM-NEXT:    mulhu a5, a0, a3
-; RV32IM-NEXT:    add a2, a5, a2
-; RV32IM-NEXT:    add a5, a6, a2
-; RV32IM-NEXT:    mul a7, a1, a3
-; RV32IM-NEXT:    add t0, a7, a5
-; RV32IM-NEXT:    mul t1, a4, a0
-; RV32IM-NEXT:    add a2, t0, t1
-; RV32IM-NEXT:    sltu t2, a2, t0
-; RV32IM-NEXT:    sltu a7, t0, a7
-; RV32IM-NEXT:    sltu a5, a5, a6
+; RV32IM-NEXT:    mul a7, a0, a3
+; RV32IM-NEXT:    mulhu t0, a0, a3
+; RV32IM-NEXT:    mul t1, a1, a3
 ; RV32IM-NEXT:    mulhu a3, a1, a3
-; RV32IM-NEXT:    add a3, a3, a5
-; RV32IM-NEXT:    add a3, a3, a7
+; RV32IM-NEXT:    add a5, a6, a5
+; RV32IM-NEXT:    mul t2, a4, a0
 ; RV32IM-NEXT:    mul a1, a4, a1
 ; RV32IM-NEXT:    mulhu a0, a4, a0
-; RV32IM-NEXT:    add a0, a0, a1
-; RV32IM-NEXT:    add a0, a0, t1
-; RV32IM-NEXT:    add a0, a3, a0
-; RV32IM-NEXT:    add a1, a0, t2
-; RV32IM-NEXT:    mv a0, a2
+; RV32IM-NEXT:    sltu a4, a5, a6
+; RV32IM-NEXT:    add a5, a7, a5
+; RV32IM-NEXT:    add a1, a0, a1
+; RV32IM-NEXT:    add a2, a2, a4
+; RV32IM-NEXT:    sltu a0, a5, a7
+; RV32IM-NEXT:    add a0, t0, a0
+; RV32IM-NEXT:    add a0, a2, a0
+; RV32IM-NEXT:    add a4, t1, a0
+; RV32IM-NEXT:    sltu a2, a0, a2
+; RV32IM-NEXT:    add a0, a4, t2
+; RV32IM-NEXT:    sltu a5, a4, t1
+; RV32IM-NEXT:    add a2, a3, a2
+; RV32IM-NEXT:    sltu a3, a0, a4
+; RV32IM-NEXT:    add a2, a2, a5
+; RV32IM-NEXT:    add a1, a1, t2
+; RV32IM-NEXT:    add a1, a2, a1
+; RV32IM-NEXT:    add a1, a1, a3
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: mulhsu_i64:
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index 676b4134461163..7d6a6d7ed4ce64 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -81,8 +81,8 @@ define i64 @neg_abs64(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
+; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
@@ -92,8 +92,8 @@ define i64 @neg_abs64(i64 %x) {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    xor a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0
@@ -121,8 +121,8 @@ define i64 @select_neg_abs64(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
+; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
@@ -132,8 +132,8 @@ define i64 @select_neg_abs64(i64 %x) {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    xor a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0
diff --git a/llvm/test/CodeGen/RISCV/or-is-add.ll b/llvm/test/CodeGen/RISCV/or-is-add.ll
index 36a201d2776752..73561675b17ecf 100644
--- a/llvm/test/CodeGen/RISCV/or-is-add.ll
+++ b/llvm/test/CodeGen/RISCV/or-is-add.ll
@@ -58,8 +58,8 @@ define i64 @test4(i64 %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    srli a2, a0, 28
 ; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    or a1, a1, a2
 ; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    or a1, a1, a2
 ; RV32-NEXT:    addi a0, a0, 13
 ; RV32-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 6412f0c8ff1cf0..5a01d43fea56ba 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -693,13 +693,12 @@ define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) {
 ; RV32-LABEL: uaddo_i64_decrement_alt:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    or a3, a0, a1
-; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    seqz a4, a0
+; RV32-NEXT:    addi a5, a0, -1
+; RV32-NEXT:    snez a0, a3
 ; RV32-NEXT:    sub a1, a1, a4
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    sw a5, 0(a2)
 ; RV32-NEXT:    sw a1, 4(a2)
-; RV32-NEXT:    mv a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: uaddo_i64_decrement_alt:
@@ -721,13 +720,12 @@ define i1 @uaddo_i64_decrement_alt_dom(i64 %x, ptr %p) {
 ; RV32-LABEL: uaddo_i64_decrement_alt_dom:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    or a3, a0, a1
-; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    seqz a4, a0
+; RV32-NEXT:    addi a5, a0, -1
+; RV32-NEXT:    snez a0, a3
 ; RV32-NEXT:    sub a1, a1, a4
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    sw a5, 0(a2)
 ; RV32-NEXT:    sw a1, 4(a2)
-; RV32-NEXT:    mv a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: uaddo_i64_decrement_alt_dom:
@@ -800,10 +798,10 @@ define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) {
 ; RV32-NEXT:    mv a5, a0
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a6, a1, a3
-; RV32-NEXT:    sub a6, a6, a0
 ; RV32-NEXT:    sub a5, a5, a2
+; RV32-NEXT:    sub a2, a6, a0
 ; RV32-NEXT:    sw a5, 0(a4)
-; RV32-NEXT:    sw a6, 4(a4)
+; RV32-NEXT:    sw a2, 4(a4)
 ; RV32-NEXT:    beq a1, a3, .LBB23_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    sltu a0, a1, a3
@@ -882,8 +880,8 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) {
 ; RV32-LABEL: usubo_ult_constant_op0_i16:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a2, a0, 16
-; RV32-NEXT:    srli a2, a2, 16
 ; RV32-NEXT:    li a3, 43
+; RV32-NEXT:    srli a2, a2, 16
 ; RV32-NEXT:    sub a3, a3, a0
 ; RV32-NEXT:    sltiu a0, a2, 44
 ; RV32-NEXT:    xori a0, a0, 1
@@ -893,8 +891,8 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) {
 ; RV64-LABEL: usubo_ult_constant_op0_i16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a2, a0, 48
-; RV64-NEXT:    srli a2, a2, 48
 ; RV64-NEXT:    li a3, 43
+; RV64-NEXT:    srli a2, a2, 48
 ; RV64-NEXT:    subw a3, a3, a0
 ; RV64-NEXT:    sltiu a0, a2, 44
 ; RV64-NEXT:    xori a0, a0, 1
@@ -1015,10 +1013,10 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV32-NEXT:    mv a7, a0
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub t0, a1, a3
-; RV32-NEXT:    sub t0, t0, a0
 ; RV32-NEXT:    sub a2, a7, a2
+; RV32-NEXT:    sub a7, t0, a0
 ; RV32-NEXT:    sw a2, 0(a4)
-; RV32-NEXT:    sw t0, 4(a4)
+; RV32-NEXT:    sw a7, 4(a4)
 ; RV32-NEXT:    beqz a6, .LBB31_5
 ; RV32-NEXT:  # %bb.2: # %end
 ; RV32-NEXT:    beq a1, a3, .LBB31_4
@@ -1108,9 +1106,9 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV32-NEXT:    sltu a0, s3, s2
 ; RV32-NEXT:  .LBB32_7: # %end
 ; RV32-NEXT:    sub a2, s3, s2
+; RV32-NEXT:    sub a3, s4, s1
 ; RV32-NEXT:    sub a2, a2, a1
-; RV32-NEXT:    sub a1, s4, s1
-; RV32-NEXT:    sw a1, 0(s0)
+; RV32-NEXT:    sw a3, 0(s0)
 ; RV32-NEXT:    sw a2, 4(s0)
 ; RV32-NEXT:    j .LBB32_9
 ; RV32-NEXT:  .LBB32_8: # %f
@@ -1275,8 +1273,8 @@ define void @PR41129(ptr %p64) {
 ; RV32-NEXT:    ret
 ; RV32-NEXT:  .LBB37_2: # %true
 ; RV32-NEXT:    seqz a3, a1
-; RV32-NEXT:    sub a2, a2, a3
 ; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sub a2, a2, a3
 ; RV32-NEXT:    sw a1, 0(a0)
 ; RV32-NEXT:    sw a2, 4(a0)
 ; RV32-NEXT:    ret
@@ -1316,9 +1314,9 @@ define i16 @overflow_not_used(i16 %a, i16 %b, ptr %res) {
 ; RV32-LABEL: overflow_not_used:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a4, a1, a3
-; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    and a3, a0, a3
 ; RV32-NEXT:    bltu a3, a4, .LBB38_2
 ; RV32-NEXT:  # %bb.1:
@@ -1331,9 +1329,9 @@ define i16 @overflow_not_used(i16 %a, i16 %b, ptr %res) {
 ; RV64-LABEL: overflow_not_used:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a3, 16
+; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    addiw a3, a3, -1
 ; RV64-NEXT:    and a4, a1, a3
-; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    and a3, a0, a3
 ; RV64-NEXT:    bltu a3, a4, .LBB38_2
 ; RV64-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll
index 8aa145f6ac5efa..8e858bdd29762f 100644
--- a/llvm/test/CodeGen/RISCV/pr51206.ll
+++ b/llvm/test/CodeGen/RISCV/pr51206.ll
@@ -12,21 +12,21 @@ define signext i32 @wobble() nounwind {
 ; CHECK-LABEL: wobble:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    lui a0, %hi(global)
+; CHECK-NEXT:    lui a1, %hi(global.1)
 ; CHECK-NEXT:    lbu a0, %lo(global)(a0)
-; CHECK-NEXT:    lui a1, %hi(global.2)
-; CHECK-NEXT:    lbu a1, %lo(global.2)(a1)
+; CHECK-NEXT:    lui a2, %hi(global.2)
+; CHECK-NEXT:    lui a3, 52429
+; CHECK-NEXT:    lbu a2, %lo(global.2)(a2)
 ; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    lui a2, %hi(global.1)
-; CHECK-NEXT:    sw a0, %lo(global.1)(a2)
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    slli a1, a0, 48
-; CHECK-NEXT:    lui a2, 52429
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mulhu a1, a1, a2
-; CHECK-NEXT:    srli a1, a1, 18
-; CHECK-NEXT:    lui a2, %hi(global.3)
+; CHECK-NEXT:    sw a0, %lo(global.1)(a1)
+; CHECK-NEXT:    lui a1, %hi(global.3)
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a2, a0, 48
+; CHECK-NEXT:    mulhu a2, a2, a3
+; CHECK-NEXT:    srli a2, a2, 18
 ; CHECK-NEXT:    li a3, 5
-; CHECK-NEXT:    sw a1, %lo(global.3)(a2)
+; CHECK-NEXT:    sw a2, %lo(global.3)(a1)
 ; CHECK-NEXT:    bgeu a0, a3, .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %bb12
 ; CHECK-NEXT:    li a0, 0
diff --git a/llvm/test/CodeGen/RISCV/pr56457.ll b/llvm/test/CodeGen/RISCV/pr56457.ll
index ba08aa838bf992..cf518b31a190b7 100644
--- a/llvm/test/CodeGen/RISCV/pr56457.ll
+++ b/llvm/test/CodeGen/RISCV/pr56457.ll
@@ -10,41 +10,41 @@ define i15 @foo(i15 %x) nounwind {
 ; CHECK-NEXT:    beqz a1, .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %cond.false
 ; CHECK-NEXT:    srli a1, a1, 50
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    lui a3, 209715
+; CHECK-NEXT:    lui a4, 61681
 ; CHECK-NEXT:    or a0, a0, a1
-; CHECK-NEXT:    slli a1, a0, 49
-; CHECK-NEXT:    srli a1, a1, 51
-; CHECK-NEXT:    or a0, a0, a1
-; CHECK-NEXT:    slli a1, a0, 49
-; CHECK-NEXT:    srli a1, a1, 53
-; CHECK-NEXT:    or a0, a0, a1
-; CHECK-NEXT:    slli a1, a0, 49
-; CHECK-NEXT:    srli a1, a1, 57
-; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    addiw a1, a2, 1365
+; CHECK-NEXT:    addiw a2, a3, 819
+; CHECK-NEXT:    addiw a3, a4, -241
+; CHECK-NEXT:    slli a4, a2, 32
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    slli a4, a3, 32
+; CHECK-NEXT:    add a3, a3, a4
+; CHECK-NEXT:    slli a4, a0, 49
+; CHECK-NEXT:    srli a4, a4, 51
+; CHECK-NEXT:    or a0, a0, a4
+; CHECK-NEXT:    slli a4, a0, 49
+; CHECK-NEXT:    srli a4, a4, 53
+; CHECK-NEXT:    or a0, a0, a4
+; CHECK-NEXT:    slli a4, a0, 49
+; CHECK-NEXT:    srli a4, a4, 57
+; CHECK-NEXT:    or a0, a0, a4
 ; CHECK-NEXT:    not a0, a0
-; CHECK-NEXT:    srli a1, a0, 1
-; CHECK-NEXT:    lui a2, 1
-; CHECK-NEXT:    addiw a2, a2, 1365
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    srli a4, a0, 1
+; CHECK-NEXT:    and a1, a4, a1
 ; CHECK-NEXT:    slli a0, a0, 49
 ; CHECK-NEXT:    srli a0, a0, 49
 ; CHECK-NEXT:    sub a0, a0, a1
-; CHECK-NEXT:    lui a1, 209715
-; CHECK-NEXT:    addiw a1, a1, 819
-; CHECK-NEXT:    slli a2, a1, 32
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    and a2, a0, a1
+; CHECK-NEXT:    and a1, a0, a2
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    srli a1, a0, 4
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addiw a1, a1, -241
-; CHECK-NEXT:    slli a2, a1, 32
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    lui a1, 4112
 ; CHECK-NEXT:    addiw a1, a1, 257
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    slli a2, a1, 32
 ; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    mul a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/pr58511.ll b/llvm/test/CodeGen/RISCV/pr58511.ll
index df02d77f613290..e5cba679729fae 100644
--- a/llvm/test/CodeGen/RISCV/pr58511.ll
+++ b/llvm/test/CodeGen/RISCV/pr58511.ll
@@ -5,8 +5,8 @@ define i32 @f(i1 %0, i32 %1, ptr %2) {
 ; CHECK-LABEL: f:
 ; CHECK:       # %bb.0: # %BB
 ; CHECK-NEXT:    slli a0, a0, 63
-; CHECK-NEXT:    srai a0, a0, 63
 ; CHECK-NEXT:    lui a3, 4097
+; CHECK-NEXT:    srai a0, a0, 63
 ; CHECK-NEXT:    addiw a3, a3, -2047
 ; CHECK-NEXT:    or a0, a0, a3
 ; CHECK-NEXT:    mul a1, a1, a3
@@ -24,8 +24,8 @@ define i32 @g(i1 %0, i32 %1, ptr %2) {
 ; CHECK-LABEL: g:
 ; CHECK:       # %bb.0: # %BB
 ; CHECK-NEXT:    andi a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    lui a3, 4097
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    addiw a3, a3, -2047
 ; CHECK-NEXT:    or a0, a0, a3
 ; CHECK-NEXT:    mul a1, a1, a3
@@ -43,10 +43,10 @@ define i32 @h(i1 %0, i32 %1, ptr %2) {
 ; CHECK-LABEL: h:
 ; CHECK:       # %bb.0: # %BB
 ; CHECK-NEXT:    lui a3, 4097
-; CHECK-NEXT:    addiw a3, a3, -2047
-; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    slli a0, a0, 63
+; CHECK-NEXT:    addiw a3, a3, -2047
 ; CHECK-NEXT:    srai a0, a0, 63
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    sw a1, 0(a2)
 ; CHECK-NEXT:    ret
@@ -64,8 +64,8 @@ define i32 @i(i1 %0, i32 %1, ptr %2) {
 ; CHECK-NEXT:    andi a0, a0, 1
 ; CHECK-NEXT:    lui a3, 4097
 ; CHECK-NEXT:    addiw a3, a3, -2047
-; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    sw a1, 0(a2)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/pr65025.ll b/llvm/test/CodeGen/RISCV/pr65025.ll
index dcd71edc460b8d..c6770b05da555e 100644
--- a/llvm/test/CodeGen/RISCV/pr65025.ll
+++ b/llvm/test/CodeGen/RISCV/pr65025.ll
@@ -7,10 +7,10 @@ define ptr @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %va
 ; CHECK-NEXT:    andi a3, a0, -4
 ; CHECK-NEXT:    slli a4, a0, 3
 ; CHECK-NEXT:    li a5, 255
-; CHECK-NEXT:    sllw a5, a5, a4
 ; CHECK-NEXT:    andi a1, a1, 255
-; CHECK-NEXT:    sllw a1, a1, a4
 ; CHECK-NEXT:    andi a2, a2, 255
+; CHECK-NEXT:    sllw a5, a5, a4
+; CHECK-NEXT:    sllw a1, a1, a4
 ; CHECK-NEXT:    sllw a2, a2, a4
 ; CHECK-NEXT:  .LBB0_3: # %do_cmpxchg
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/RISCV/pr68855.ll b/llvm/test/CodeGen/RISCV/pr68855.ll
index e9d1f6c2d1b2cc..8031bf4f30411b 100644
--- a/llvm/test/CodeGen/RISCV/pr68855.ll
+++ b/llvm/test/CodeGen/RISCV/pr68855.ll
@@ -6,10 +6,10 @@ define i16 @narrow_load(ptr %p1, ptr %p2) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lhu a2, 0(a0)
 ; CHECK-NEXT:    lui a3, 2
-; CHECK-NEXT:    addiw a3, a3, -1
-; CHECK-NEXT:    xor a2, a2, a3
 ; CHECK-NEXT:    lui a4, 16
+; CHECK-NEXT:    addiw a3, a3, -1
 ; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    xor a2, a2, a3
 ; CHECK-NEXT:    xor a4, a3, a4
 ; CHECK-NEXT:    or a2, a2, a4
 ; CHECK-NEXT:    sw a2, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 69746e3e70bfcf..9fc9a3c42867e7 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -7,21 +7,21 @@
 define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-LABEL: test:
 ; NOREMAT:       # %bb.0:
-; NOREMAT-NEXT:    addi sp, sp, -400
-; NOREMAT-NEXT:    .cfi_def_cfa_offset 400
-; NOREMAT-NEXT:    sd ra, 392(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s0, 384(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s1, 376(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s2, 368(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s3, 360(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s4, 352(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s5, 344(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s6, 336(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s7, 328(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s8, 320(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s9, 312(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s10, 304(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s11, 296(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addi sp, sp, -752
+; NOREMAT-NEXT:    .cfi_def_cfa_offset 752
+; NOREMAT-NEXT:    sd ra, 744(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s0, 736(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s1, 728(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s2, 720(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s3, 712(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s4, 704(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s5, 696(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s6, 688(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s7, 680(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s8, 672(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s9, 664(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s10, 656(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s11, 648(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    .cfi_offset ra, -8
 ; NOREMAT-NEXT:    .cfi_offset s0, -16
 ; NOREMAT-NEXT:    .cfi_offset s1, -24
@@ -36,747 +36,845 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    .cfi_offset s10, -96
 ; NOREMAT-NEXT:    .cfi_offset s11, -104
 ; NOREMAT-NEXT:    csrr a2, vlenb
-; NOREMAT-NEXT:    li a3, 6
-; NOREMAT-NEXT:    mul a2, a2, a3
+; NOREMAT-NEXT:    slli a2, a2, 1
 ; NOREMAT-NEXT:    sub sp, sp, a2
-; NOREMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x03, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 400 + 6 * vlenb
-; NOREMAT-NEXT:    li a2, 32
-; NOREMAT-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
-; NOREMAT-NEXT:    vle32.v v8, (a0)
-; NOREMAT-NEXT:    addi a2, a0, 512
-; NOREMAT-NEXT:    vle32.v v10, (a2)
-; NOREMAT-NEXT:    addi a2, a0, 1024
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    addi a2, a0, 1536
-; NOREMAT-NEXT:    vle32.v v14, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a2)
-; NOREMAT-NEXT:    li a2, 1
-; NOREMAT-NEXT:    slli a2, a2, 11
-; NOREMAT-NEXT:    sd a2, 272(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    li a5, 5
-; NOREMAT-NEXT:    slli a2, a5, 9
-; NOREMAT-NEXT:    sd a2, 264(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v14, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a2)
-; NOREMAT-NEXT:    li a2, 3
-; NOREMAT-NEXT:    slli a3, a2, 10
-; NOREMAT-NEXT:    sd a3, 256(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v12, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    li a4, 7
-; NOREMAT-NEXT:    slli a3, a4, 9
-; NOREMAT-NEXT:    sd a3, 248(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v14, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a3)
-; NOREMAT-NEXT:    lui a3, 1
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v12, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    li a3, 9
-; NOREMAT-NEXT:    slli a6, a3, 9
-; NOREMAT-NEXT:    sd a6, 240(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v14, (a6)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a6)
-; NOREMAT-NEXT:    slli a6, a5, 10
-; NOREMAT-NEXT:    sd a6, 232(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v12, (a6)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a6)
-; NOREMAT-NEXT:    li s8, 11
-; NOREMAT-NEXT:    slli a6, s8, 9
-; NOREMAT-NEXT:    sd a6, 224(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v14, (a6)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a6)
-; NOREMAT-NEXT:    slli a2, a2, 11
-; NOREMAT-NEXT:    sd a2, 216(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    li s2, 13
-; NOREMAT-NEXT:    slli a2, s2, 9
-; NOREMAT-NEXT:    sd a2, 208(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v14, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a2)
-; NOREMAT-NEXT:    slli a2, a4, 10
-; NOREMAT-NEXT:    sd a2, 200(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    li a2, 15
-; NOREMAT-NEXT:    slli a6, a2, 9
-; NOREMAT-NEXT:    sd a6, 192(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v26, (a6)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v16, (a6)
-; NOREMAT-NEXT:    lui a6, 2
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v28, (a6)
-; NOREMAT-NEXT:    vle32.v v10, (a6)
-; NOREMAT-NEXT:    li a6, 17
-; NOREMAT-NEXT:    slli a6, a6, 9
-; NOREMAT-NEXT:    sd a6, 184(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    li t0, 17
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v30, (a6)
-; NOREMAT-NEXT:    vle32.v v18, (a6)
-; NOREMAT-NEXT:    slli a6, a3, 10
-; NOREMAT-NEXT:    sd a6, 176(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v6, (a6)
-; NOREMAT-NEXT:    vle32.v v20, (a6)
-; NOREMAT-NEXT:    li a6, 19
-; NOREMAT-NEXT:    slli a6, a6, 9
-; NOREMAT-NEXT:    sd a6, 168(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    li a7, 19
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v4, (a6)
-; NOREMAT-NEXT:    vle32.v v22, (a6)
-; NOREMAT-NEXT:    slli a5, a5, 11
-; NOREMAT-NEXT:    sd a5, 160(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v2, (a5)
-; NOREMAT-NEXT:    vle32.v v12, (a5)
-; NOREMAT-NEXT:    li s10, 21
-; NOREMAT-NEXT:    slli a5, s10, 9
-; NOREMAT-NEXT:    sd a5, 152(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v24, (a5)
-; NOREMAT-NEXT:    vle32.v v14, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
-; NOREMAT-NEXT:    slli a5, s8, 10
-; NOREMAT-NEXT:    sd a5, 144(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v26, (a5)
+; NOREMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
+; NOREMAT-NEXT:    mv a7, a0
+; NOREMAT-NEXT:    li a0, 32
+; NOREMAT-NEXT:    addi a5, a7, 512
+; NOREMAT-NEXT:    addi a4, a7, 1024
+; NOREMAT-NEXT:    addi a6, a7, 1536
+; NOREMAT-NEXT:    li t4, 1
+; NOREMAT-NEXT:    li a2, 5
+; NOREMAT-NEXT:    li t1, 3
+; NOREMAT-NEXT:    li t0, 7
+; NOREMAT-NEXT:    lui t5, 1
+; NOREMAT-NEXT:    li s4, 9
+; NOREMAT-NEXT:    li s6, 11
+; NOREMAT-NEXT:    li s9, 13
+; NOREMAT-NEXT:    li ra, 15
+; NOREMAT-NEXT:    lui t2, 2
+; NOREMAT-NEXT:    lui s1, 3
+; NOREMAT-NEXT:    lui t3, 4
+; NOREMAT-NEXT:    lui s0, 5
+; NOREMAT-NEXT:    lui s3, 6
+; NOREMAT-NEXT:    lui s7, 7
+; NOREMAT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; NOREMAT-NEXT:    slli t4, t4, 11
+; NOREMAT-NEXT:    sd t4, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli a3, a2, 9
+; NOREMAT-NEXT:    sd a3, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli t6, t1, 10
+; NOREMAT-NEXT:    slli s2, t0, 9
+; NOREMAT-NEXT:    add a0, a7, t5
+; NOREMAT-NEXT:    lui s11, 1
+; NOREMAT-NEXT:    slli s4, s4, 9
+; NOREMAT-NEXT:    slli s5, a2, 10
+; NOREMAT-NEXT:    slli s6, s6, 9
+; NOREMAT-NEXT:    slli s8, t1, 11
 ; NOREMAT-NEXT:    vle32.v v8, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v28
-; NOREMAT-NEXT:    li s6, 23
-; NOREMAT-NEXT:    slli a5, s6, 9
-; NOREMAT-NEXT:    sd a5, 136(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v28, (a5)
-; NOREMAT-NEXT:    vle32.v v16, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v30
-; NOREMAT-NEXT:    lui a5, 3
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v30, (a5)
-; NOREMAT-NEXT:    vle32.v v10, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v6
-; NOREMAT-NEXT:    li s3, 25
-; NOREMAT-NEXT:    slli a5, s3, 9
-; NOREMAT-NEXT:    sd a5, 128(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v6, (a5)
-; NOREMAT-NEXT:    vle32.v v18, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v4
-; NOREMAT-NEXT:    slli a5, s2, 10
-; NOREMAT-NEXT:    sd a5, 120(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v4, (a5)
-; NOREMAT-NEXT:    vle32.v v20, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v2
-; NOREMAT-NEXT:    li t5, 27
-; NOREMAT-NEXT:    slli a5, t5, 9
-; NOREMAT-NEXT:    sd a5, 112(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v2, (a5)
-; NOREMAT-NEXT:    vle32.v v22, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v24
-; NOREMAT-NEXT:    slli a4, a4, 11
-; NOREMAT-NEXT:    sd a4, 104(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v24, (a4)
-; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v26
-; NOREMAT-NEXT:    li t2, 29
-; NOREMAT-NEXT:    slli a4, t2, 9
-; NOREMAT-NEXT:    sd a4, 96(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
+; NOREMAT-NEXT:    slli s9, s9, 9
+; NOREMAT-NEXT:    li t5, 13
+; NOREMAT-NEXT:    vle32.v v10, (a4)
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    slli s10, t0, 10
+; NOREMAT-NEXT:    vle32.v v0, (a6)
+; NOREMAT-NEXT:    vle32.v v12, (a6)
+; NOREMAT-NEXT:    slli ra, ra, 9
+; NOREMAT-NEXT:    vle32.v v4, (a0)
+; NOREMAT-NEXT:    vle32.v v20, (a0)
+; NOREMAT-NEXT:    add a4, a7, t2
+; NOREMAT-NEXT:    vle32.v v6, (a4)
+; NOREMAT-NEXT:    vle32.v v30, (a4)
+; NOREMAT-NEXT:    add a4, a7, s1
+; NOREMAT-NEXT:    vle32.v v28, (a4)
 ; NOREMAT-NEXT:    vle32.v v26, (a4)
+; NOREMAT-NEXT:    add a4, a7, t3
+; NOREMAT-NEXT:    vle32.v v24, (a4)
+; NOREMAT-NEXT:    vle32.v v22, (a4)
+; NOREMAT-NEXT:    add a4, a7, s0
+; NOREMAT-NEXT:    vle32.v v14, (a7)
+; NOREMAT-NEXT:    vle32.v v18, (a4)
+; NOREMAT-NEXT:    vle32.v v16, (a4)
+; NOREMAT-NEXT:    add a4, a7, s3
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v8
 ; NOREMAT-NEXT:    vle32.v v14, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v28
-; NOREMAT-NEXT:    slli a4, a2, 10
-; NOREMAT-NEXT:    sd a4, 88(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v28, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    csrr a4, vlenb
-; NOREMAT-NEXT:    slli a4, a4, 2
-; NOREMAT-NEXT:    add a4, sp, a4
-; NOREMAT-NEXT:    addi a4, a4, 288
-; NOREMAT-NEXT:    vs2r.v v8, (a4) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v30
-; NOREMAT-NEXT:    li a5, 31
-; NOREMAT-NEXT:    slli a4, a5, 9
-; NOREMAT-NEXT:    sd a4, 80(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v6
-; NOREMAT-NEXT:    lui a6, 4
-; NOREMAT-NEXT:    add a4, a0, a6
-; NOREMAT-NEXT:    vle32.v v6, (a4)
+; NOREMAT-NEXT:    addi a0, sp, 640
+; NOREMAT-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; NOREMAT-NEXT:    add a4, a7, t4
+; NOREMAT-NEXT:    vle32.v v10, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    add a4, a7, a3
+; NOREMAT-NEXT:    vle32.v v0, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v10
+; NOREMAT-NEXT:    vle32.v v10, (a4)
+; NOREMAT-NEXT:    add a4, a7, t6
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    add a4, a7, s2
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    add a4, a7, s7
+; NOREMAT-NEXT:    vle32.v v0, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v8
+; NOREMAT-NEXT:    vle32.v v10, (a4)
+; NOREMAT-NEXT:    add a4, a7, s4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    csrr a4, vlenb
-; NOREMAT-NEXT:    slli a4, a4, 1
-; NOREMAT-NEXT:    add a4, sp, a4
-; NOREMAT-NEXT:    addi a4, a4, 288
-; NOREMAT-NEXT:    vs2r.v v8, (a4) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v4
-; NOREMAT-NEXT:    addiw a4, a6, 512
-; NOREMAT-NEXT:    sd a4, 72(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    add a4, a7, s5
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
-; NOREMAT-NEXT:    vle32.v v18, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v2
-; NOREMAT-NEXT:    slli a4, t0, 10
-; NOREMAT-NEXT:    sd a4, 64(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v8
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    add a4, a7, s6
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
-; NOREMAT-NEXT:    addiw a4, a6, 1536
-; NOREMAT-NEXT:    sd a4, 56(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v0, (a4)
-; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v26
-; NOREMAT-NEXT:    slli a3, a3, 11
-; NOREMAT-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    add a4, a7, s8
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    add a4, a7, s9
+; NOREMAT-NEXT:    vle32.v v20, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    add a4, a7, s10
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    add a4, a7, ra
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; NOREMAT-NEXT:    lui t4, 8
+; NOREMAT-NEXT:    add a5, a7, t4
+; NOREMAT-NEXT:    vle32.v v20, (a5)
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v2
+; NOREMAT-NEXT:    li a4, 17
+; NOREMAT-NEXT:    slli a4, a4, 9
+; NOREMAT-NEXT:    li s1, 17
+; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v6
+; NOREMAT-NEXT:    li a5, 9
+; NOREMAT-NEXT:    slli a4, a5, 10
+; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    vle32.v v6, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
+; NOREMAT-NEXT:    li a4, 19
+; NOREMAT-NEXT:    slli a4, a4, 9
+; NOREMAT-NEXT:    li t2, 19
+; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    vle32.v v30, (a4)
+; NOREMAT-NEXT:    slli a3, a2, 11
+; NOREMAT-NEXT:    sd a3, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
+; NOREMAT-NEXT:    vle32.v v4, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
+; NOREMAT-NEXT:    li s7, 21
+; NOREMAT-NEXT:    slli a3, s7, 9
+; NOREMAT-NEXT:    sd a3, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    addi a3, sp, 288
-; NOREMAT-NEXT:    vs2r.v v8, (a3) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v28
-; NOREMAT-NEXT:    lui s1, 5
-; NOREMAT-NEXT:    addiw a3, s1, -1536
-; NOREMAT-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    vle32.v v24, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 2
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v30
-; NOREMAT-NEXT:    slli a3, a7, 10
-; NOREMAT-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v10, (a3)
-; NOREMAT-NEXT:    vle32.v v14, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v6
-; NOREMAT-NEXT:    addiw a3, s1, -512
-; NOREMAT-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
-; NOREMAT-NEXT:    vle32.v v16, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 1
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vl2r.v v26, (a3) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v4
-; NOREMAT-NEXT:    add a3, a0, s1
-; NOREMAT-NEXT:    vle32.v v26, (a3)
-; NOREMAT-NEXT:    vle32.v v28, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 2
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vs2r.v v28, (a3) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
-; NOREMAT-NEXT:    addiw ra, s1, 512
-; NOREMAT-NEXT:    add a3, a0, ra
-; NOREMAT-NEXT:    vle32.v v28, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
+; NOREMAT-NEXT:    li a6, 11
+; NOREMAT-NEXT:    slli a3, a6, 10
+; NOREMAT-NEXT:    sd a3, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
-; NOREMAT-NEXT:    slli s11, s10, 10
-; NOREMAT-NEXT:    add a3, a0, s11
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
+; NOREMAT-NEXT:    li s3, 23
+; NOREMAT-NEXT:    slli a3, s3, 9
+; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
-; NOREMAT-NEXT:    vle32.v v18, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v12
-; NOREMAT-NEXT:    addiw s10, s1, 1536
-; NOREMAT-NEXT:    add a3, a0, s10
-; NOREMAT-NEXT:    vle32.v v2, (a3)
-; NOREMAT-NEXT:    vle32.v v20, (a3)
-; NOREMAT-NEXT:    addi a3, sp, 288
-; NOREMAT-NEXT:    vl2r.v v12, (a3) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v8
-; NOREMAT-NEXT:    slli s9, s8, 11
-; NOREMAT-NEXT:    add a3, a0, s9
-; NOREMAT-NEXT:    vle32.v v0, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
+; NOREMAT-NEXT:    li s0, 25
+; NOREMAT-NEXT:    slli a3, s0, 9
+; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v10
-; NOREMAT-NEXT:    lui t0, 6
-; NOREMAT-NEXT:    addiw s8, t0, -1536
-; NOREMAT-NEXT:    add a3, a0, s8
-; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    vle32.v v22, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
-; NOREMAT-NEXT:    slli s7, s6, 10
-; NOREMAT-NEXT:    add a3, a0, s7
-; NOREMAT-NEXT:    vle32.v v10, (a3)
-; NOREMAT-NEXT:    vle32.v v14, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v26
-; NOREMAT-NEXT:    addiw s6, t0, -512
-; NOREMAT-NEXT:    add a3, a0, s6
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
-; NOREMAT-NEXT:    vle32.v v16, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 2
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vl2r.v v24, (a3) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v28
-; NOREMAT-NEXT:    add a3, a0, t0
-; NOREMAT-NEXT:    vle32.v v24, (a3)
-; NOREMAT-NEXT:    vle32.v v26, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 2
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vs2r.v v26, (a3) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v4
-; NOREMAT-NEXT:    addiw s5, t0, 512
-; NOREMAT-NEXT:    add a3, a0, s5
-; NOREMAT-NEXT:    vle32.v v26, (a3)
-; NOREMAT-NEXT:    vle32.v v28, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 1
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vs2r.v v28, (a3) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
-; NOREMAT-NEXT:    slli s4, s3, 10
-; NOREMAT-NEXT:    add a3, a0, s4
-; NOREMAT-NEXT:    vle32.v v28, (a3)
-; NOREMAT-NEXT:    vle32.v v18, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
-; NOREMAT-NEXT:    addiw s3, t0, 1536
-; NOREMAT-NEXT:    add a3, a0, s3
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
+; NOREMAT-NEXT:    slli a3, t5, 10
+; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
-; NOREMAT-NEXT:    vle32.v v20, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v8
-; NOREMAT-NEXT:    slli s2, s2, 11
-; NOREMAT-NEXT:    add a3, a0, s2
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v28
+; NOREMAT-NEXT:    li t3, 27
+; NOREMAT-NEXT:    slli a3, t3, 9
+; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    vle32.v v28, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
-; NOREMAT-NEXT:    vle32.v v12, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v10
-; NOREMAT-NEXT:    lui a3, 7
-; NOREMAT-NEXT:    addiw s0, a3, -1536
-; NOREMAT-NEXT:    add a4, a0, s0
-; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
-; NOREMAT-NEXT:    slli t6, t5, 10
-; NOREMAT-NEXT:    add a4, a0, t6
-; NOREMAT-NEXT:    vle32.v v0, (a4)
-; NOREMAT-NEXT:    vle32.v v14, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v24
-; NOREMAT-NEXT:    addiw t5, a3, -512
-; NOREMAT-NEXT:    add a4, a0, t5
-; NOREMAT-NEXT:    vle32.v v6, (a4)
-; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    csrr a4, vlenb
-; NOREMAT-NEXT:    slli a4, a4, 2
-; NOREMAT-NEXT:    add a4, sp, a4
-; NOREMAT-NEXT:    addi a4, a4, 288
-; NOREMAT-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
-; NOREMAT-NEXT:    add a4, a0, a3
-; NOREMAT-NEXT:    vle32.v v26, (a4)
-; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    csrr a4, vlenb
-; NOREMAT-NEXT:    slli a4, a4, 1
-; NOREMAT-NEXT:    add a4, sp, a4
-; NOREMAT-NEXT:    addi a4, a4, 288
-; NOREMAT-NEXT:    vl2r.v v10, (a4) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
-; NOREMAT-NEXT:    addiw t4, a3, 512
-; NOREMAT-NEXT:    add a4, a0, t4
-; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    vle32.v v24, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v30
-; NOREMAT-NEXT:    slli t3, t2, 10
-; NOREMAT-NEXT:    add a4, a0, t3
-; NOREMAT-NEXT:    vle32.v v18, (a4)
-; NOREMAT-NEXT:    vle32.v v28, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v4
-; NOREMAT-NEXT:    addiw t2, a3, 1536
-; NOREMAT-NEXT:    add a4, a0, t2
-; NOREMAT-NEXT:    vle32.v v20, (a4)
-; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
-; NOREMAT-NEXT:    slli t1, a2, 11
-; NOREMAT-NEXT:    add a2, a0, t1
+; NOREMAT-NEXT:    slli a2, t0, 11
+; NOREMAT-NEXT:    sd a2, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
+; NOREMAT-NEXT:    li t0, 29
+; NOREMAT-NEXT:    slli a2, t0, 9
+; NOREMAT-NEXT:    sd a2, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v28
+; NOREMAT-NEXT:    li a3, 15
+; NOREMAT-NEXT:    slli a2, a3, 10
+; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
+; NOREMAT-NEXT:    li t1, 31
+; NOREMAT-NEXT:    slli a2, t1, 9
+; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v0
-; NOREMAT-NEXT:    lui a2, 8
-; NOREMAT-NEXT:    addiw a7, a2, -1536
-; NOREMAT-NEXT:    add a4, a0, a7
-; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
-; NOREMAT-NEXT:    slli a6, a5, 10
-; NOREMAT-NEXT:    add a4, a0, a6
-; NOREMAT-NEXT:    vle32.v v14, (a4)
-; NOREMAT-NEXT:    vle32.v v6, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v26
-; NOREMAT-NEXT:    addiw a5, a2, -512
-; NOREMAT-NEXT:    add a4, a0, a5
-; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    vle32.v v26, (a4)
-; NOREMAT-NEXT:    add a0, a0, a2
-; NOREMAT-NEXT:    vle32.v v0, (a0)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v18
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v20
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
+; NOREMAT-NEXT:    lui a4, 4
+; NOREMAT-NEXT:    addiw a0, a4, 512
+; NOREMAT-NEXT:    sd a0, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a7, a0
+; NOREMAT-NEXT:    vle32.v v8, (a0)
+; NOREMAT-NEXT:    vle32.v v26, (a0)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v28
+; NOREMAT-NEXT:    slli a2, s1, 10
+; NOREMAT-NEXT:    sd a2, 488(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
+; NOREMAT-NEXT:    addiw a2, a4, 1536
+; NOREMAT-NEXT:    sd a2, 480(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    slli a2, a5, 11
+; NOREMAT-NEXT:    sd a2, 472(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v24
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v8
+; NOREMAT-NEXT:    lui a5, 5
+; NOREMAT-NEXT:    addiw a2, a5, -1536
+; NOREMAT-NEXT:    sd a2, 464(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v28
+; NOREMAT-NEXT:    slli a2, t2, 10
+; NOREMAT-NEXT:    sd a2, 456(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li t2, 19
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
+; NOREMAT-NEXT:    addiw a2, a5, -512
+; NOREMAT-NEXT:    sd a2, 448(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v24
+; NOREMAT-NEXT:    addiw a2, a5, 512
+; NOREMAT-NEXT:    sd a2, 440(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    slli a2, s7, 10
+; NOREMAT-NEXT:    sd a2, 432(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v26
+; NOREMAT-NEXT:    addiw a2, a5, 1536
+; NOREMAT-NEXT:    sd a2, 424(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    slli a2, a6, 11
+; NOREMAT-NEXT:    sd a2, 416(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v12
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v18
+; NOREMAT-NEXT:    lui a6, 6
+; NOREMAT-NEXT:    addiw a2, a6, -1536
+; NOREMAT-NEXT:    sd a2, 408(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v18, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    slli a2, s3, 10
+; NOREMAT-NEXT:    sd a2, 400(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v24
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v16, (a2)
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
+; NOREMAT-NEXT:    addiw a2, a6, -512
+; NOREMAT-NEXT:    sd a2, 392(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v22
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v14
+; NOREMAT-NEXT:    addiw a2, a6, 512
+; NOREMAT-NEXT:    sd a2, 384(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    slli a2, s0, 10
+; NOREMAT-NEXT:    sd a2, 376(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v2, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v18
+; NOREMAT-NEXT:    addiw a2, a6, 1536
+; NOREMAT-NEXT:    sd a2, 368(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v18, (a2)
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    slli a2, t5, 11
+; NOREMAT-NEXT:    sd a2, 360(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v16
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v0
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v16, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v8
+; NOREMAT-NEXT:    lui s0, 7
+; NOREMAT-NEXT:    addiw a2, s0, -1536
+; NOREMAT-NEXT:    sd a2, 352(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    slli a2, t3, 10
+; NOREMAT-NEXT:    sd a2, 344(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v14
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v14, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    addi a0, sp, 640
+; NOREMAT-NEXT:    vl2r.v v12, (a0) # Unknown-size Folded Reload
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v22
+; NOREMAT-NEXT:    addiw a2, s0, -512
+; NOREMAT-NEXT:    sd a2, 336(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v26
+; NOREMAT-NEXT:    addiw a2, s0, 512
+; NOREMAT-NEXT:    sd a2, 328(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui t3, 7
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    slli a2, t0, 10
+; NOREMAT-NEXT:    sd a2, 320(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v18
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v18, (a2)
+; NOREMAT-NEXT:    vle32.v v2, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v16
+; NOREMAT-NEXT:    addiw a2, t3, 1536
+; NOREMAT-NEXT:    sd a2, 312(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v16, (a2)
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    slli a2, a3, 11
+; NOREMAT-NEXT:    sd a2, 304(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v14
+; NOREMAT-NEXT:    addiw a2, t4, -1536
+; NOREMAT-NEXT:    sd a2, 296(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v14, (a2)
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    slli a2, t1, 10
+; NOREMAT-NEXT:    sd a2, 288(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v22
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    addiw a0, t4, -512
+; NOREMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a7, a0
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
+; NOREMAT-NEXT:    vle32.v v12, (a0)
+; NOREMAT-NEXT:    vle32.v v0, (a0)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v26
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v18
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v16
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v14
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v22
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v20
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    addi a0, a1, 1024
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    add s11, a1, s11
+; NOREMAT-NEXT:    sd s11, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui a0, 2
+; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    sd a0, 264(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui a0, 3
+; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a1, a4
+; NOREMAT-NEXT:    sd a4, 248(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a5, a1, a5
+; NOREMAT-NEXT:    sd a5, 240(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a6, a1, a6
+; NOREMAT-NEXT:    sd a6, 232(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add t3, a1, t3
+; NOREMAT-NEXT:    sd t3, 224(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a1, t4
+; NOREMAT-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, t4, 512
+; NOREMAT-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, t4, 1024
+; NOREMAT-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, t4, 1536
+; NOREMAT-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli s1, s1, 11
+; NOREMAT-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui a0, 9
+; NOREMAT-NEXT:    addiw a2, a0, -1536
+; NOREMAT-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a2, a0, -1024
+; NOREMAT-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a2, a0, -512
+; NOREMAT-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a1, a0
+; NOREMAT-NEXT:    sd a2, 208(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw s11, a0, 512
+; NOREMAT-NEXT:    addiw s7, a0, 1024
+; NOREMAT-NEXT:    addiw s3, a0, 1536
+; NOREMAT-NEXT:    slli s1, t2, 11
+; NOREMAT-NEXT:    lui a0, 10
+; NOREMAT-NEXT:    addiw t2, a0, -1536
+; NOREMAT-NEXT:    addiw a7, a0, -1024
+; NOREMAT-NEXT:    addiw a4, a0, -512
+; NOREMAT-NEXT:    add a2, a1, a0
+; NOREMAT-NEXT:    sd a2, 200(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, a0, 512
+; NOREMAT-NEXT:    ld a2, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add a2, a1, a2
+; NOREMAT-NEXT:    ld a3, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add a3, a1, a3
+; NOREMAT-NEXT:    add a5, a1, t6
+; NOREMAT-NEXT:    add a6, a1, s2
+; NOREMAT-NEXT:    add t0, a1, s4
+; NOREMAT-NEXT:    add t1, a1, s5
+; NOREMAT-NEXT:    add t3, a1, s6
+; NOREMAT-NEXT:    add t4, a1, s8
+; NOREMAT-NEXT:    add t5, a1, s9
+; NOREMAT-NEXT:    add t6, a1, s10
+; NOREMAT-NEXT:    add s0, a1, ra
+; NOREMAT-NEXT:    ld s2, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s2, a1, s2
+; NOREMAT-NEXT:    ld s4, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s4, a1, s4
+; NOREMAT-NEXT:    ld s5, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s5, a1, s5
+; NOREMAT-NEXT:    ld s6, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s6, a1, s6
+; NOREMAT-NEXT:    ld s8, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s8, a1, s8
+; NOREMAT-NEXT:    ld s9, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s9, a1, s9
+; NOREMAT-NEXT:    ld s10, 576(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s10, a1, s10
+; NOREMAT-NEXT:    ld ra, 568(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 16(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 32(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 48(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 64(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 80(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 496(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 96(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 488(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 480(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 112(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 472(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 464(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 456(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 144(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 448(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 440(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 168(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 432(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 184(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 424(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 424(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 416(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 432(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 408(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 440(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 400(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 448(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 392(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 456(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 384(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 464(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 472(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 368(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 480(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 360(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 488(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 352(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 344(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 336(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 328(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 320(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 312(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 304(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 296(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 288(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 280(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 192(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 576(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 176(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 160(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 128(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add ra, a1, s11
+; NOREMAT-NEXT:    add s11, a1, s7
+; NOREMAT-NEXT:    add s7, a1, s3
+; NOREMAT-NEXT:    add s3, a1, s1
+; NOREMAT-NEXT:    add s1, a1, t2
+; NOREMAT-NEXT:    add t2, a1, a7
+; NOREMAT-NEXT:    add a7, a1, a4
+; NOREMAT-NEXT:    add a4, a1, a0
 ; NOREMAT-NEXT:    addi a0, a1, 1536
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a2)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a3)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a5)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a6)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 272(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t0)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t1)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t3)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t4)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t5)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t6)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (s0)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 264(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    vse32.v v8, (s2)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    vse32.v v8, (s4)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    lui a0, 1
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    vse32.v v8, (s5)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    vse32.v v8, (s6)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 232(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    vse32.v v8, (s8)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (s9)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (s10)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 224(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    lui a0, 2
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 160(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    lui a0, 3
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 424(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 432(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 440(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    lui a0, 4
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 448(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 456(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 232(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 464(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 472(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 480(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 488(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 496(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s1, a1, s1
-; NOREMAT-NEXT:    vse32.v v8, (s1)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    vse32.v v8, (ra)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s11, a1, s11
-; NOREMAT-NEXT:    vse32.v v8, (s11)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s10, a1, s10
-; NOREMAT-NEXT:    vse32.v v8, (s10)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s9, a1, s9
-; NOREMAT-NEXT:    vse32.v v8, (s9)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s8, a1, s8
-; NOREMAT-NEXT:    vse32.v v8, (s8)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s7, a1, s7
-; NOREMAT-NEXT:    vse32.v v8, (s7)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s6, a1, s6
-; NOREMAT-NEXT:    vse32.v v8, (s6)
+; NOREMAT-NEXT:    ld a0, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t0, a1, t0
-; NOREMAT-NEXT:    vse32.v v8, (t0)
+; NOREMAT-NEXT:    ld a0, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s5, a1, s5
-; NOREMAT-NEXT:    vse32.v v8, (s5)
+; NOREMAT-NEXT:    ld a0, 224(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s4, a1, s4
-; NOREMAT-NEXT:    vse32.v v8, (s4)
+; NOREMAT-NEXT:    ld a0, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s3, a1, s3
-; NOREMAT-NEXT:    vse32.v v8, (s3)
+; NOREMAT-NEXT:    ld a0, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s2, a1, s2
-; NOREMAT-NEXT:    vse32.v v8, (s2)
+; NOREMAT-NEXT:    ld a0, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s0, a1, s0
-; NOREMAT-NEXT:    vse32.v v8, (s0)
+; NOREMAT-NEXT:    ld a0, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t6, a1, t6
-; NOREMAT-NEXT:    vse32.v v8, (t6)
+; NOREMAT-NEXT:    ld a0, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t5, a1, t5
-; NOREMAT-NEXT:    vse32.v v8, (t5)
+; NOREMAT-NEXT:    ld a0, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add a3, a1, a3
-; NOREMAT-NEXT:    vse32.v v8, (a3)
+; NOREMAT-NEXT:    ld a0, 568(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t4, a1, t4
-; NOREMAT-NEXT:    vse32.v v8, (t4)
+; NOREMAT-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t3, a1, t3
-; NOREMAT-NEXT:    vse32.v v8, (t3)
+; NOREMAT-NEXT:    ld a0, 576(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t2, a1, t2
-; NOREMAT-NEXT:    vse32.v v8, (t2)
+; NOREMAT-NEXT:    ld a0, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t1, a1, t1
-; NOREMAT-NEXT:    vse32.v v8, (t1)
+; NOREMAT-NEXT:    ld a0, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add a7, a1, a7
-; NOREMAT-NEXT:    vse32.v v8, (a7)
+; NOREMAT-NEXT:    ld a0, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add a6, a1, a6
-; NOREMAT-NEXT:    vse32.v v8, (a6)
+; NOREMAT-NEXT:    ld a0, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add a5, a1, a5
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a5)
-; NOREMAT-NEXT:    add a0, a1, a2
+; NOREMAT-NEXT:    ld a0, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a0)
-; NOREMAT-NEXT:    addiw a0, a2, 512
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
+; NOREMAT-NEXT:    ld a0, 624(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    addiw a0, a2, 1024
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v10, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    addiw a0, a2, 1536
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
+; NOREMAT-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    li a0, 17
-; NOREMAT-NEXT:    slli a0, a0, 11
-; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a0)
-; NOREMAT-NEXT:    lui a0, 9
-; NOREMAT-NEXT:    addiw a2, a0, -1536
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, -1024
-; NOREMAT-NEXT:    add a2, a1, a2
+; NOREMAT-NEXT:    vse32.v v8, (ra)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, -512
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    add a2, a1, a0
+; NOREMAT-NEXT:    vse32.v v8, (s11)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, 512
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, 1024
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    vse32.v v10, (a2)
+; NOREMAT-NEXT:    vse32.v v8, (s7)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    addiw a0, a0, 1536
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    li a0, 19
-; NOREMAT-NEXT:    slli a0, a0, 11
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    vse32.v v8, (s3)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a0)
-; NOREMAT-NEXT:    lui a0, 10
-; NOREMAT-NEXT:    addiw a2, a0, -1536
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, -1024
-; NOREMAT-NEXT:    add a2, a1, a2
+; NOREMAT-NEXT:    vse32.v v8, (s1)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, -512
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    add a2, a1, a0
-; NOREMAT-NEXT:    vse32.v v10, (a2)
-; NOREMAT-NEXT:    addiw a0, a0, 512
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    vse32.v v8, (t2)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a7)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a4)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    csrr a0, vlenb
-; NOREMAT-NEXT:    li a1, 6
-; NOREMAT-NEXT:    mul a0, a0, a1
+; NOREMAT-NEXT:    slli a0, a0, 1
 ; NOREMAT-NEXT:    add sp, sp, a0
-; NOREMAT-NEXT:    .cfi_def_cfa sp, 400
-; NOREMAT-NEXT:    ld ra, 392(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s0, 384(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s1, 376(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s2, 368(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s3, 360(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s4, 352(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s5, 344(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s6, 336(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s7, 328(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s8, 320(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s9, 312(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s10, 304(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s11, 296(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    .cfi_def_cfa sp, 752
+; NOREMAT-NEXT:    ld ra, 744(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s0, 736(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s1, 728(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s2, 720(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s3, 712(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s4, 704(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s5, 696(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s6, 688(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s7, 680(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s8, 672(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s9, 664(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s10, 656(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s11, 648(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    .cfi_restore ra
 ; NOREMAT-NEXT:    .cfi_restore s0
 ; NOREMAT-NEXT:    .cfi_restore s1
@@ -790,27 +888,27 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    .cfi_restore s9
 ; NOREMAT-NEXT:    .cfi_restore s10
 ; NOREMAT-NEXT:    .cfi_restore s11
-; NOREMAT-NEXT:    addi sp, sp, 400
+; NOREMAT-NEXT:    addi sp, sp, 752
 ; NOREMAT-NEXT:    .cfi_def_cfa_offset 0
 ; NOREMAT-NEXT:    ret
 ;
 ; REMAT-LABEL: test:
 ; REMAT:       # %bb.0:
-; REMAT-NEXT:    addi sp, sp, -112
-; REMAT-NEXT:    .cfi_def_cfa_offset 112
-; REMAT-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    addi sp, sp, -544
+; REMAT-NEXT:    .cfi_def_cfa_offset 544
+; REMAT-NEXT:    sd ra, 536(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s0, 528(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s1, 520(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s2, 512(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s3, 504(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s4, 496(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s5, 488(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s6, 480(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s7, 472(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s8, 464(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s9, 456(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s10, 448(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s11, 440(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    .cfi_offset ra, -8
 ; REMAT-NEXT:    .cfi_offset s0, -16
 ; REMAT-NEXT:    .cfi_offset s1, -24
@@ -824,730 +922,980 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    .cfi_offset s9, -88
 ; REMAT-NEXT:    .cfi_offset s10, -96
 ; REMAT-NEXT:    .cfi_offset s11, -104
-; REMAT-NEXT:    li a2, 32
-; REMAT-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
-; REMAT-NEXT:    vle32.v v8, (a0)
-; REMAT-NEXT:    addi a2, a0, 512
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    addi a2, a0, 1024
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
-; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 18
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    sub sp, sp, a2
+; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 18 * vlenb
+; REMAT-NEXT:    li a4, 32
+; REMAT-NEXT:    addi a5, a0, 512
+; REMAT-NEXT:    addi a3, a0, 1024
 ; REMAT-NEXT:    addi a2, a0, 1536
+; REMAT-NEXT:    li a6, 1
+; REMAT-NEXT:    slli a6, a6, 11
+; REMAT-NEXT:    li a7, 5
+; REMAT-NEXT:    slli a7, a7, 9
+; REMAT-NEXT:    li t0, 3
+; REMAT-NEXT:    slli t0, t0, 10
+; REMAT-NEXT:    li t1, 7
+; REMAT-NEXT:    slli t1, t1, 9
+; REMAT-NEXT:    lui t2, 1
+; REMAT-NEXT:    li t3, 9
+; REMAT-NEXT:    slli t3, t3, 9
+; REMAT-NEXT:    li t4, 5
+; REMAT-NEXT:    slli t4, t4, 10
+; REMAT-NEXT:    li t5, 11
+; REMAT-NEXT:    slli t5, t5, 9
+; REMAT-NEXT:    li t6, 3
+; REMAT-NEXT:    slli t6, t6, 11
+; REMAT-NEXT:    li s0, 13
+; REMAT-NEXT:    slli s0, s0, 9
+; REMAT-NEXT:    li s1, 7
+; REMAT-NEXT:    slli s1, s1, 10
+; REMAT-NEXT:    li s2, 15
+; REMAT-NEXT:    slli s2, s2, 9
+; REMAT-NEXT:    lui s3, 2
+; REMAT-NEXT:    li s4, 17
+; REMAT-NEXT:    slli s4, s4, 9
+; REMAT-NEXT:    li s5, 9
+; REMAT-NEXT:    slli s5, s5, 10
+; REMAT-NEXT:    li s6, 19
+; REMAT-NEXT:    slli s6, s6, 9
+; REMAT-NEXT:    li s7, 5
+; REMAT-NEXT:    slli s7, s7, 11
+; REMAT-NEXT:    li s8, 21
+; REMAT-NEXT:    slli s8, s8, 9
+; REMAT-NEXT:    li s9, 11
+; REMAT-NEXT:    slli s9, s9, 10
+; REMAT-NEXT:    li s10, 23
+; REMAT-NEXT:    slli s10, s10, 9
+; REMAT-NEXT:    lui s11, 3
+; REMAT-NEXT:    li ra, 25
+; REMAT-NEXT:    slli ra, ra, 9
+; REMAT-NEXT:    vsetvli zero, a4, e32, m2, ta, ma
+; REMAT-NEXT:    vle32.v v8, (a5)
+; REMAT-NEXT:    li a4, 13
+; REMAT-NEXT:    slli a4, a4, 10
+; REMAT-NEXT:    vle32.v v10, (a3)
+; REMAT-NEXT:    vle32.v v12, (a3)
+; REMAT-NEXT:    li a3, 27
+; REMAT-NEXT:    slli a3, a3, 9
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    add a2, a0, t0
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    add a2, a0, t1
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 4
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, t2
+; REMAT-NEXT:    vle32.v v4, (a0)
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a5, 14
+; REMAT-NEXT:    mul a2, a2, a5
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, t3
+; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    add a2, a0, t4
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 1
-; REMAT-NEXT:    slli a2, a2, 11
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 5
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v14
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    add a2, a0, t5
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 3
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v18
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 7
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 1
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a5, 12
+; REMAT-NEXT:    mul a2, a2, a5
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    add a2, a0, s0
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v26
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    add a2, a0, s1
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v30
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    add a2, a0, s2
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 9
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 5
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    csrr a5, vlenb
+; REMAT-NEXT:    slli a5, a5, 4
+; REMAT-NEXT:    add a5, sp, a5
+; REMAT-NEXT:    addi a5, a5, 432
+; REMAT-NEXT:    vl2r.v v12, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    add a2, a0, s3
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 11
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; REMAT-NEXT:    csrr a5, vlenb
+; REMAT-NEXT:    li a6, 14
+; REMAT-NEXT:    mul a5, a5, a6
+; REMAT-NEXT:    add a5, sp, a5
+; REMAT-NEXT:    addi a5, a5, 432
+; REMAT-NEXT:    vl2r.v v16, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    add a2, a0, s5
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 3
-; REMAT-NEXT:    slli a2, a2, 11
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 13
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v14
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    add a2, a0, s6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 7
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    csrr a5, vlenb
+; REMAT-NEXT:    li a6, 12
+; REMAT-NEXT:    mul a5, a5, a6
+; REMAT-NEXT:    add a5, sp, a5
+; REMAT-NEXT:    addi a5, a5, 432
+; REMAT-NEXT:    vl2r.v v0, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    add a2, a0, s8
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v26
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    add a2, a0, s9
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    add a2, a0, s10
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 15
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 2
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v12
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 17
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 9
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v12, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, s11
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 19
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v16
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    add a2, a0, ra
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 5
-; REMAT-NEXT:    slli a2, a2, 11
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 21
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 1
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, a4
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 11
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    li a2, 23
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v14
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    lui a2, 3
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    li a2, 25
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    li a2, 13
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    li a2, 27
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li a2, 7
-; REMAT-NEXT:    slli a2, a2, 11
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v26
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 2
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, a3
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 4
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    li a5, 7
+; REMAT-NEXT:    slli a5, a5, 11
+; REMAT-NEXT:    add a2, a0, a5
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    addi a3, sp, 432
+; REMAT-NEXT:    vs2r.v v18, (a3) # Unknown-size Folded Spill
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v20
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 14
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    li a2, 29
 ; REMAT-NEXT:    slli a2, a2, 9
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v28
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v24
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 12
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v20, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    li a2, 15
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v30
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 10
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    li a2, 31
 ; REMAT-NEXT:    slli a2, a2, 9
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v6
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 3
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v12
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    lui a2, 4
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v4
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v2
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 6
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    lui a2, 4
 ; REMAT-NEXT:    addiw a2, a2, 512
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v2
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 1
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    li a2, 17
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 2
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    vle32.v v22, (a2)
 ; REMAT-NEXT:    lui a2, 4
 ; REMAT-NEXT:    addiw a2, a2, 1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    addi a3, sp, 432
+; REMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    li a2, 9
 ; REMAT-NEXT:    slli a2, a2, 11
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 14
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v18
+; REMAT-NEXT:    vle32.v v10, (a2)
 ; REMAT-NEXT:    lui a2, 5
 ; REMAT-NEXT:    addiw a2, a2, -1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    vle32.v v10, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 12
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v12, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    li a2, 19
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 10
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v14, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
-; REMAT-NEXT:    lui ra, 5
-; REMAT-NEXT:    addiw ra, ra, -512
-; REMAT-NEXT:    add a2, a0, ra
-; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 3
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v16, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
-; REMAT-NEXT:    lui s11, 5
-; REMAT-NEXT:    add a2, a0, s11
-; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 6
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v18, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
-; REMAT-NEXT:    lui s10, 5
-; REMAT-NEXT:    addiw s10, s10, 512
-; REMAT-NEXT:    add a2, a0, s10
-; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v24
-; REMAT-NEXT:    li s9, 21
-; REMAT-NEXT:    slli s9, s9, 10
-; REMAT-NEXT:    add a2, a0, s9
-; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, 512
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v26
-; REMAT-NEXT:    lui s8, 5
-; REMAT-NEXT:    addiw s8, s8, 1536
-; REMAT-NEXT:    add a2, a0, s8
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v28
-; REMAT-NEXT:    li s7, 11
-; REMAT-NEXT:    slli s7, s7, 11
+; REMAT-NEXT:    li s7, 21
+; REMAT-NEXT:    slli s7, s7, 10
 ; REMAT-NEXT:    add a2, a0, s7
-; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    lui s4, 5
+; REMAT-NEXT:    addiw s4, s4, 1536
+; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v30
-; REMAT-NEXT:    lui s6, 6
-; REMAT-NEXT:    addiw s6, s6, -1536
-; REMAT-NEXT:    add a2, a0, s6
-; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    li a2, 11
+; REMAT-NEXT:    slli a2, a2, 11
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v6
-; REMAT-NEXT:    li s5, 23
-; REMAT-NEXT:    slli s5, s5, 10
-; REMAT-NEXT:    add a2, a0, s5
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v4
-; REMAT-NEXT:    lui s4, 6
-; REMAT-NEXT:    addiw s4, s4, -512
-; REMAT-NEXT:    add a2, a0, s4
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v2
 ; REMAT-NEXT:    lui s3, 6
+; REMAT-NEXT:    addiw s3, s3, -1536
 ; REMAT-NEXT:    add a2, a0, s3
-; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v24
-; REMAT-NEXT:    lui s2, 6
-; REMAT-NEXT:    addiw s2, s2, 512
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    li s2, 23
+; REMAT-NEXT:    slli s2, s2, 10
 ; REMAT-NEXT:    add a2, a0, s2
-; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui s1, 6
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v26
-; REMAT-NEXT:    li s1, 25
-; REMAT-NEXT:    slli s1, s1, 10
-; REMAT-NEXT:    add a2, a0, s1
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v28
 ; REMAT-NEXT:    lui s0, 6
-; REMAT-NEXT:    addiw s0, s0, 1536
+; REMAT-NEXT:    addiw s0, s0, 512
 ; REMAT-NEXT:    add a2, a0, s0
-; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    li a2, 25
+; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v30
-; REMAT-NEXT:    li t6, 13
-; REMAT-NEXT:    slli t6, t6, 11
+; REMAT-NEXT:    lui t6, 6
+; REMAT-NEXT:    addiw t6, t6, 1536
 ; REMAT-NEXT:    add a2, a0, t6
-; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v6
-; REMAT-NEXT:    lui t5, 7
-; REMAT-NEXT:    addiw t5, t5, -1536
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    li t5, 13
+; REMAT-NEXT:    slli t5, t5, 11
 ; REMAT-NEXT:    add a2, a0, t5
-; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    li t4, 27
 ; REMAT-NEXT:    slli t4, t4, 10
 ; REMAT-NEXT:    add a2, a0, t4
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v2
-; REMAT-NEXT:    lui t3, 7
-; REMAT-NEXT:    addiw t3, t3, -512
-; REMAT-NEXT:    add a2, a0, t3
-; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v0
-; REMAT-NEXT:    lui t2, 7
-; REMAT-NEXT:    add a2, a0, t2
-; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v26
-; REMAT-NEXT:    lui t1, 7
-; REMAT-NEXT:    addiw t1, t1, 512
-; REMAT-NEXT:    add a2, a0, t1
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui t3, 7
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v28
-; REMAT-NEXT:    li t0, 29
-; REMAT-NEXT:    slli t0, t0, 10
-; REMAT-NEXT:    add a2, a0, t0
+; REMAT-NEXT:    lui t2, 7
+; REMAT-NEXT:    addiw t2, t2, 512
+; REMAT-NEXT:    add a2, a0, t2
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v30
-; REMAT-NEXT:    lui a7, 7
-; REMAT-NEXT:    addiw a7, a7, 1536
-; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    li t1, 29
+; REMAT-NEXT:    slli t1, t1, 10
+; REMAT-NEXT:    add a2, a0, t1
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v6
-; REMAT-NEXT:    li a6, 15
-; REMAT-NEXT:    slli a6, a6, 11
-; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    lui t0, 7
+; REMAT-NEXT:    addiw t0, t0, 1536
+; REMAT-NEXT:    add a2, a0, t0
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v4
-; REMAT-NEXT:    lui a5, 8
-; REMAT-NEXT:    addiw a5, a5, -1536
-; REMAT-NEXT:    add a2, a0, a5
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    li a7, 15
+; REMAT-NEXT:    slli a7, a7, 11
+; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT:    lui a6, 8
+; REMAT-NEXT:    addiw a6, a6, -1536
+; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    li a4, 31
 ; REMAT-NEXT:    slli a4, a4, 10
 ; REMAT-NEXT:    add a2, a0, a4
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v0
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
+; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    lui a3, 8
 ; REMAT-NEXT:    addiw a3, a3, -512
 ; REMAT-NEXT:    add a2, a0, a3
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    lui a2, 8
 ; REMAT-NEXT:    add a0, a0, a2
-; REMAT-NEXT:    vle32.v v0, (a0)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v16
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v20
-; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v22
-; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v24
-; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v10
-; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
-; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v14
-; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
+; REMAT-NEXT:    vle32.v v4, (a0)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    addi a0, a1, 1024
 ; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    addi a0, a1, 1536
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    li a0, 1
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 416(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 5
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 408(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 3
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 400(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 7
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 392(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 1
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 384(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 9
 ; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 5
-; REMAT-NEXT:    slli a0, a0, 10
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 11
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 3
-; REMAT-NEXT:    slli a0, a0, 11
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 13
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 7
-; REMAT-NEXT:    slli a0, a0, 10
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 15
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    lui a0, 2
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 17
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 9
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 376(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 5
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 19
+; REMAT-NEXT:    sd a0, 368(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 11
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 5
+; REMAT-NEXT:    sd a0, 360(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 3
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 21
+; REMAT-NEXT:    sd a0, 352(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 13
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 11
+; REMAT-NEXT:    sd a0, 344(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 7
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 23
+; REMAT-NEXT:    sd a0, 336(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 15
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    lui a0, 3
+; REMAT-NEXT:    sd a0, 328(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 2
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 25
+; REMAT-NEXT:    sd a0, 320(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 17
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 312(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s5, a1, s5
+; REMAT-NEXT:    sd s5, 304(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s6, a1, s6
+; REMAT-NEXT:    sd s6, 296(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 5
+; REMAT-NEXT:    slli a0, a0, 11
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 288(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s8, a1, s8
+; REMAT-NEXT:    sd s8, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s9, a1, s9
+; REMAT-NEXT:    sd s9, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s10, a1, s10
+; REMAT-NEXT:    sd s10, 264(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s11, a1, s11
+; REMAT-NEXT:    sd s11, 256(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add ra, a1, ra
+; REMAT-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 13
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 240(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 27
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 7
-; REMAT-NEXT:    slli a0, a0, 11
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 232(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add a5, a1, a5
+; REMAT-NEXT:    sd a5, 224(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 29
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 15
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 208(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 31
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 200(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 4
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 4
 ; REMAT-NEXT:    addiw a0, a0, 512
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 17
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 4
 ; REMAT-NEXT:    addiw a0, a0, 1536
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 168(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 9
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 5
 ; REMAT-NEXT:    addiw a0, a0, -1536
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 19
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 5
+; REMAT-NEXT:    addiw a0, a0, -512
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 5
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 128(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 5
+; REMAT-NEXT:    addiw a0, a0, 512
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s7, a1, s7
+; REMAT-NEXT:    sd s7, 112(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s4, a1, s4
+; REMAT-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 11
+; REMAT-NEXT:    slli a0, a0, 11
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s3, a1, s3
+; REMAT-NEXT:    sd s3, 88(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s2, a1, s2
+; REMAT-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 6
+; REMAT-NEXT:    addiw a0, a0, -512
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 72(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s1, a1, s1
+; REMAT-NEXT:    sd s1, 64(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s0, a1, s0
+; REMAT-NEXT:    sd s0, 56(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 25
+; REMAT-NEXT:    slli a0, a0, 10
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add t6, a1, t6
+; REMAT-NEXT:    sd t6, 40(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add t5, a1, t5
+; REMAT-NEXT:    sd t5, 32(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 7
+; REMAT-NEXT:    addiw a0, a0, -1536
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add t4, a1, t4
+; REMAT-NEXT:    sd t4, 16(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui ra, 7
+; REMAT-NEXT:    addiw ra, ra, -512
 ; REMAT-NEXT:    add ra, a1, ra
+; REMAT-NEXT:    add s11, a1, t3
+; REMAT-NEXT:    add s10, a1, t2
+; REMAT-NEXT:    add s9, a1, t1
+; REMAT-NEXT:    add s8, a1, t0
+; REMAT-NEXT:    add s7, a1, a7
+; REMAT-NEXT:    add s6, a1, a6
+; REMAT-NEXT:    add s5, a1, a4
+; REMAT-NEXT:    add s4, a1, a3
+; REMAT-NEXT:    add s3, a1, a2
+; REMAT-NEXT:    lui s2, 8
+; REMAT-NEXT:    addiw s2, s2, 512
+; REMAT-NEXT:    add s2, a1, s2
+; REMAT-NEXT:    lui s1, 8
+; REMAT-NEXT:    addiw s1, s1, 1024
+; REMAT-NEXT:    add s1, a1, s1
+; REMAT-NEXT:    lui s0, 8
+; REMAT-NEXT:    addiw s0, s0, 1536
+; REMAT-NEXT:    add s0, a1, s0
+; REMAT-NEXT:    li t6, 17
+; REMAT-NEXT:    slli t6, t6, 11
+; REMAT-NEXT:    add t6, a1, t6
+; REMAT-NEXT:    lui t5, 9
+; REMAT-NEXT:    addiw t5, t5, -1536
+; REMAT-NEXT:    add t5, a1, t5
+; REMAT-NEXT:    lui t4, 9
+; REMAT-NEXT:    addiw t4, t4, -1024
+; REMAT-NEXT:    add t4, a1, t4
+; REMAT-NEXT:    lui t3, 9
+; REMAT-NEXT:    addiw t3, t3, -512
+; REMAT-NEXT:    add t3, a1, t3
+; REMAT-NEXT:    lui t2, 9
+; REMAT-NEXT:    add t2, a1, t2
+; REMAT-NEXT:    lui t1, 9
+; REMAT-NEXT:    addiw t1, t1, 512
+; REMAT-NEXT:    add t1, a1, t1
+; REMAT-NEXT:    lui t0, 9
+; REMAT-NEXT:    addiw t0, t0, 1024
+; REMAT-NEXT:    add t0, a1, t0
+; REMAT-NEXT:    lui a7, 9
+; REMAT-NEXT:    addiw a7, a7, 1536
+; REMAT-NEXT:    add a7, a1, a7
+; REMAT-NEXT:    li a6, 19
+; REMAT-NEXT:    slli a6, a6, 11
+; REMAT-NEXT:    add a6, a1, a6
+; REMAT-NEXT:    lui a5, 10
+; REMAT-NEXT:    addiw a5, a5, -1536
+; REMAT-NEXT:    add a5, a1, a5
+; REMAT-NEXT:    lui a4, 10
+; REMAT-NEXT:    addiw a4, a4, -1024
+; REMAT-NEXT:    add a4, a1, a4
+; REMAT-NEXT:    lui a3, 10
+; REMAT-NEXT:    addiw a3, a3, -512
+; REMAT-NEXT:    add a3, a1, a3
+; REMAT-NEXT:    lui a2, 10
+; REMAT-NEXT:    add a2, a1, a2
+; REMAT-NEXT:    lui a0, 10
+; REMAT-NEXT:    addiw a0, a0, 512
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    addi a1, a1, 1536
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 416(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 408(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 400(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 392(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 384(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 376(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 368(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 360(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 352(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 344(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 336(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 328(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 320(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 312(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 304(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 296(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 288(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 280(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 272(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 264(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 256(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 248(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 240(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 232(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 224(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 216(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 208(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 200(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 192(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 184(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 176(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 168(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 160(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 152(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 144(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 136(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 128(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 120(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 112(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 96(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 88(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 80(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 72(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 64(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 56(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 48(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 40(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 32(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 24(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 16(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    vse32.v v8, (ra)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s11, a1, s11
 ; REMAT-NEXT:    vse32.v v8, (s11)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s10, a1, s10
 ; REMAT-NEXT:    vse32.v v8, (s10)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s9, a1, s9
 ; REMAT-NEXT:    vse32.v v8, (s9)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s8, a1, s8
 ; REMAT-NEXT:    vse32.v v8, (s8)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s7, a1, s7
 ; REMAT-NEXT:    vse32.v v8, (s7)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s6, a1, s6
 ; REMAT-NEXT:    vse32.v v8, (s6)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s5, a1, s5
 ; REMAT-NEXT:    vse32.v v8, (s5)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s4, a1, s4
 ; REMAT-NEXT:    vse32.v v8, (s4)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s3, a1, s3
 ; REMAT-NEXT:    vse32.v v8, (s3)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s2, a1, s2
 ; REMAT-NEXT:    vse32.v v8, (s2)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s1, a1, s1
 ; REMAT-NEXT:    vse32.v v8, (s1)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s0, a1, s0
 ; REMAT-NEXT:    vse32.v v8, (s0)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t6, a1, t6
 ; REMAT-NEXT:    vse32.v v8, (t6)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t5, a1, t5
 ; REMAT-NEXT:    vse32.v v8, (t5)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t4, a1, t4
 ; REMAT-NEXT:    vse32.v v8, (t4)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t3, a1, t3
 ; REMAT-NEXT:    vse32.v v8, (t3)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t2, a1, t2
 ; REMAT-NEXT:    vse32.v v8, (t2)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t1, a1, t1
 ; REMAT-NEXT:    vse32.v v8, (t1)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t0, a1, t0
 ; REMAT-NEXT:    vse32.v v8, (t0)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a7, a1, a7
 ; REMAT-NEXT:    vse32.v v8, (a7)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a6, a1, a6
 ; REMAT-NEXT:    vse32.v v8, (a6)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a5, a1, a5
 ; REMAT-NEXT:    vse32.v v8, (a5)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a4, a1, a4
 ; REMAT-NEXT:    vse32.v v8, (a4)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a3, a1, a3
 ; REMAT-NEXT:    vse32.v v8, (a3)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a2, a1, a2
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
 ; REMAT-NEXT:    vse32.v v8, (a2)
-; REMAT-NEXT:    lui a0, 8
-; REMAT-NEXT:    addiw a0, a0, 512
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 8
-; REMAT-NEXT:    addiw a0, a0, 1024
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 8
-; REMAT-NEXT:    addiw a0, a0, 1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    li a0, 17
-; REMAT-NEXT:    slli a0, a0, 11
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, -1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, -1024
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, -512
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, 512
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, 1024
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, 1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    li a0, 19
-; REMAT-NEXT:    slli a0, a0, 11
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    addiw a0, a0, -1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    addiw a0, a0, -1024
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    addiw a0, a0, -512
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    addiw a0, a0, 512
-; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    vse32.v v8, (a0)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    csrr a0, vlenb
+; REMAT-NEXT:    li a1, 18
+; REMAT-NEXT:    mul a0, a0, a1
+; REMAT-NEXT:    add sp, sp, a0
+; REMAT-NEXT:    .cfi_def_cfa sp, 544
+; REMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s0, 528(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s1, 520(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s2, 512(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s3, 504(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s4, 496(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s5, 488(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s6, 480(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s7, 472(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s8, 464(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s9, 456(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s10, 448(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s11, 440(sp) # 8-byte Folded Reload
 ; REMAT-NEXT:    .cfi_restore ra
 ; REMAT-NEXT:    .cfi_restore s0
 ; REMAT-NEXT:    .cfi_restore s1
@@ -1561,7 +1909,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    .cfi_restore s9
 ; REMAT-NEXT:    .cfi_restore s10
 ; REMAT-NEXT:    .cfi_restore s11
-; REMAT-NEXT:    addi sp, sp, 112
+; REMAT-NEXT:    addi sp, sp, 544
 ; REMAT-NEXT:    .cfi_def_cfa_offset 0
 ; REMAT-NEXT:    ret
   %4 = tail call i64 @llvm.riscv.vsetvli.i64(i64 32, i64 2, i64 1)
diff --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
index f9b9c8a69d431c..b1bba5fdc92116 100644
--- a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
+++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
@@ -8,13 +8,13 @@ define i1 @pr84653(i32 %x) {
 ; CHECK-NOZBB-LABEL: pr84653:
 ; CHECK-NOZBB:       # %bb.0:
 ; CHECK-NOZBB-NEXT:    sext.w a1, a0
-; CHECK-NOZBB-NEXT:    sgtz a2, a1
-; CHECK-NOZBB-NEXT:    lui a3, 524288
-; CHECK-NOZBB-NEXT:    addi a3, a3, -1
-; CHECK-NOZBB-NEXT:    xor a0, a0, a3
+; CHECK-NOZBB-NEXT:    lui a2, 524288
+; CHECK-NOZBB-NEXT:    sgtz a3, a1
+; CHECK-NOZBB-NEXT:    addi a2, a2, -1
+; CHECK-NOZBB-NEXT:    xor a0, a0, a2
 ; CHECK-NOZBB-NEXT:    sext.w a0, a0
 ; CHECK-NOZBB-NEXT:    slt a0, a0, a1
-; CHECK-NOZBB-NEXT:    and a0, a2, a0
+; CHECK-NOZBB-NEXT:    and a0, a3, a0
 ; CHECK-NOZBB-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: pr84653:
@@ -69,13 +69,13 @@ define i1 @select_to_or(i32 %x) {
 ; CHECK-NOZBB-LABEL: select_to_or:
 ; CHECK-NOZBB:       # %bb.0:
 ; CHECK-NOZBB-NEXT:    sext.w a1, a0
-; CHECK-NOZBB-NEXT:    sgtz a2, a1
-; CHECK-NOZBB-NEXT:    lui a3, 524288
-; CHECK-NOZBB-NEXT:    addi a3, a3, -1
-; CHECK-NOZBB-NEXT:    xor a0, a0, a3
+; CHECK-NOZBB-NEXT:    lui a2, 524288
+; CHECK-NOZBB-NEXT:    sgtz a3, a1
+; CHECK-NOZBB-NEXT:    addi a2, a2, -1
+; CHECK-NOZBB-NEXT:    xor a0, a0, a2
 ; CHECK-NOZBB-NEXT:    sext.w a0, a0
 ; CHECK-NOZBB-NEXT:    slt a0, a0, a1
-; CHECK-NOZBB-NEXT:    or a0, a2, a0
+; CHECK-NOZBB-NEXT:    or a0, a3, a0
 ; CHECK-NOZBB-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: select_to_or:
diff --git a/llvm/test/CodeGen/RISCV/pr95271.ll b/llvm/test/CodeGen/RISCV/pr95271.ll
index 950e6fb5f37ec2..aa941cb8036276 100644
--- a/llvm/test/CodeGen/RISCV/pr95271.ll
+++ b/llvm/test/CodeGen/RISCV/pr95271.ll
@@ -6,22 +6,22 @@ define i32 @PR95271(ptr %p) {
 ; RV32I-LABEL: PR95271:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lui a1, 349525
+; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    addi a0, a0, 1
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -33,23 +33,23 @@ define i32 @PR95271(ptr %p) {
 ; RV64I-LABEL: PR95271:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lw a0, 0(a0)
-; RV64I-NEXT:    addiw a1, a0, 1
-; RV64I-NEXT:    addi a0, a0, 1
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addi a2, a0, 1
+; RV64I-NEXT:    srli a2, a2, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a0, a0, 1
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    sub a1, a1, a0
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    and a2, a1, a0
-; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
index dd270fa12183ed..fb0c11e930b3b6 100644
--- a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
+++ b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
@@ -24,6 +24,8 @@ define void @last_chance_recoloring_failure() {
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 16 * vlenb
 ; CHECK-NEXT:    li a0, 55
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmclr.m v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vloxseg2ei32.v v16, (a1), v8
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -35,8 +37,6 @@ define void @last_chance_recoloring_failure() {
 ; CHECK-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs4r.v v20, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmclr.m v0
 ; CHECK-NEXT:    li s0, 36
 ; CHECK-NEXT:    vsetvli zero, s0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwadd.vv v16, v8, v12, v0.t
@@ -84,6 +84,8 @@ define void @last_chance_recoloring_failure() {
 ; SUBREGLIVENESS-NEXT:    sub sp, sp, a0
 ; SUBREGLIVENESS-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 16 * vlenb
 ; SUBREGLIVENESS-NEXT:    li a0, 55
+; SUBREGLIVENESS-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; SUBREGLIVENESS-NEXT:    vmclr.m v0
 ; SUBREGLIVENESS-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; SUBREGLIVENESS-NEXT:    vloxseg2ei32.v v16, (a1), v8
 ; SUBREGLIVENESS-NEXT:    csrr a0, vlenb
@@ -95,8 +97,6 @@ define void @last_chance_recoloring_failure() {
 ; SUBREGLIVENESS-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
 ; SUBREGLIVENESS-NEXT:    add a0, a0, a1
 ; SUBREGLIVENESS-NEXT:    vs4r.v v20, (a0) # Unknown-size Folded Spill
-; SUBREGLIVENESS-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; SUBREGLIVENESS-NEXT:    vmclr.m v0
 ; SUBREGLIVENESS-NEXT:    li s0, 36
 ; SUBREGLIVENESS-NEXT:    vsetvli zero, s0, e16, m4, ta, ma
 ; SUBREGLIVENESS-NEXT:    vfwadd.vv v16, v8, v12, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rem.ll b/llvm/test/CodeGen/RISCV/rem.ll
index 5b27c4129df6ad..2001262008237f 100644
--- a/llvm/test/CodeGen/RISCV/rem.ll
+++ b/llvm/test/CodeGen/RISCV/rem.ll
@@ -23,8 +23,8 @@ define i32 @urem(i32 %a, i32 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -452,8 +452,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -463,8 +463,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind {
 ; RV32IM-LABEL: srem8:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a1, 24
-; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    slli a0, a0, 24
+; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    rem a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -474,8 +474,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -485,8 +485,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind {
 ; RV64IM-LABEL: srem8:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a1, 56
-; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    slli a0, a0, 56
+; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    remw a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -637,8 +637,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -648,8 +648,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind {
 ; RV32IM-LABEL: srem16:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a1, 16
-; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    slli a0, a0, 16
+; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    rem a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -659,8 +659,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -670,8 +670,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind {
 ; RV64IM-LABEL: srem16:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a1, 48
-; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    slli a0, a0, 48
+; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    remw a0, a0, a1
 ; RV64IM-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
index 352184c2d85ada..32261ee47164e5 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
@@ -64,11 +64,11 @@ define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    lw a5, -4(a4)
 ; CHECK-NEXT:    lw a6, 0(a4)
+; CHECK-NEXT:    addi a3, a3, 2
 ; CHECK-NEXT:    addi a5, a5, 4
 ; CHECK-NEXT:    addi a6, a6, 4
 ; CHECK-NEXT:    sw a5, -4(a4)
 ; CHECK-NEXT:    sw a6, 0(a4)
-; CHECK-NEXT:    addi a3, a3, 2
 ; CHECK-NEXT:    addi a4, a4, 8
 ; CHECK-NEXT:    bne a1, a3, .LBB1_4
 ; CHECK-NEXT:  .LBB1_5: # %for.cond.cleanup.loopexit.unr-lcssa
diff --git a/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll b/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
index 4901e268ec11a0..c1e7b682200eb1 100644
--- a/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
@@ -6,13 +6,13 @@ define void @test(ptr nocapture noundef writeonly %array1, i32 noundef signext %
 ; RV64-LABEL: test:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    addiw a3, a1, 5
-; RV64-NEXT:    slli a4, a3, 2
-; RV64-NEXT:    add a4, a0, a4
 ; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sw a2, 0(a4)
-; RV64-NEXT:    sw a2, 24(a0)
-; RV64-NEXT:    sw a3, 140(a0)
+; RV64-NEXT:    slli a4, a3, 2
+; RV64-NEXT:    add a1, a1, a0
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    sw a2, 0(a0)
+; RV64-NEXT:    sw a2, 24(a1)
+; RV64-NEXT:    sw a3, 140(a1)
 ; RV64-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
@@ -70,13 +70,13 @@ define void @test2(ptr nocapture noundef writeonly %array1, i64 noundef %a, i64
 ; RV64-LABEL: test2:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    addi a3, a1, 5
-; RV64-NEXT:    slli a4, a3, 3
-; RV64-NEXT:    add a4, a0, a4
 ; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sd a2, 0(a4)
-; RV64-NEXT:    sd a2, 48(a0)
-; RV64-NEXT:    sd a3, 280(a0)
+; RV64-NEXT:    slli a4, a3, 3
+; RV64-NEXT:    add a1, a1, a0
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    sd a2, 0(a0)
+; RV64-NEXT:    sd a2, 48(a1)
+; RV64-NEXT:    sd a3, 280(a1)
 ; RV64-NEXT:    ret
 entry:
   %add = add nsw i64 %a, 5
@@ -101,8 +101,8 @@ define void @test3(ptr nocapture noundef %array1, i64 noundef %a, i64 noundef %b
 ; RV64-NEXT:    mv a5, a2
 ; RV64-NEXT:  .LBB3_2: # %entry
 ; RV64-NEXT:    slli a2, a4, 3
-; RV64-NEXT:    add a2, a0, a2
 ; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a2, a0, a2
 ; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    sd a5, 0(a2)
 ; RV64-NEXT:    sd a5, 48(a0)
diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
index d907a37c2b3d17..634cca5dcdb71b 100644
--- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
@@ -133,10 +133,10 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    or a3, a3, a6
 ; RV32I-NEXT:  .LBB2_3:
 ; RV32I-NEXT:    srai a5, a5, 31
-; RV32I-NEXT:    and a4, a5, a4
 ; RV32I-NEXT:    neg a7, a2
-; RV32I-NEXT:    li a5, 32
-; RV32I-NEXT:    sub a5, a5, a2
+; RV32I-NEXT:    li a6, 32
+; RV32I-NEXT:    and a4, a5, a4
+; RV32I-NEXT:    sub a5, a6, a2
 ; RV32I-NEXT:    srl a6, a1, a7
 ; RV32I-NEXT:    bltz a5, .LBB2_5
 ; RV32I-NEXT:  # %bb.4:
@@ -181,10 +181,10 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ; RV32ZBB-NEXT:    or a3, a3, a6
 ; RV32ZBB-NEXT:  .LBB2_3:
 ; RV32ZBB-NEXT:    srai a5, a5, 31
-; RV32ZBB-NEXT:    and a4, a5, a4
 ; RV32ZBB-NEXT:    neg a7, a2
-; RV32ZBB-NEXT:    li a5, 32
-; RV32ZBB-NEXT:    sub a5, a5, a2
+; RV32ZBB-NEXT:    li a6, 32
+; RV32ZBB-NEXT:    and a4, a5, a4
+; RV32ZBB-NEXT:    sub a5, a6, a2
 ; RV32ZBB-NEXT:    srl a6, a1, a7
 ; RV32ZBB-NEXT:    bltz a5, .LBB2_5
 ; RV32ZBB-NEXT:  # %bb.4:
@@ -226,10 +226,10 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    or a3, a3, a6
 ; RV32XTHEADBB-NEXT:  .LBB2_3:
 ; RV32XTHEADBB-NEXT:    srai a5, a5, 31
-; RV32XTHEADBB-NEXT:    and a4, a5, a4
 ; RV32XTHEADBB-NEXT:    neg a7, a2
-; RV32XTHEADBB-NEXT:    li a5, 32
-; RV32XTHEADBB-NEXT:    sub a5, a5, a2
+; RV32XTHEADBB-NEXT:    li a6, 32
+; RV32XTHEADBB-NEXT:    and a4, a5, a4
+; RV32XTHEADBB-NEXT:    sub a5, a6, a2
 ; RV32XTHEADBB-NEXT:    srl a6, a1, a7
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB2_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
@@ -281,10 +281,10 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    or a3, a3, a6
 ; RV32I-NEXT:  .LBB3_3:
 ; RV32I-NEXT:    srai a5, a5, 31
-; RV32I-NEXT:    and a4, a5, a4
 ; RV32I-NEXT:    neg a7, a2
-; RV32I-NEXT:    li a5, 32
-; RV32I-NEXT:    sub a5, a5, a2
+; RV32I-NEXT:    li a6, 32
+; RV32I-NEXT:    and a4, a5, a4
+; RV32I-NEXT:    sub a5, a6, a2
 ; RV32I-NEXT:    sll a6, a0, a7
 ; RV32I-NEXT:    bltz a5, .LBB3_5
 ; RV32I-NEXT:  # %bb.4:
@@ -329,10 +329,10 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ; RV32ZBB-NEXT:    or a3, a3, a6
 ; RV32ZBB-NEXT:  .LBB3_3:
 ; RV32ZBB-NEXT:    srai a5, a5, 31
-; RV32ZBB-NEXT:    and a4, a5, a4
 ; RV32ZBB-NEXT:    neg a7, a2
-; RV32ZBB-NEXT:    li a5, 32
-; RV32ZBB-NEXT:    sub a5, a5, a2
+; RV32ZBB-NEXT:    li a6, 32
+; RV32ZBB-NEXT:    and a4, a5, a4
+; RV32ZBB-NEXT:    sub a5, a6, a2
 ; RV32ZBB-NEXT:    sll a6, a0, a7
 ; RV32ZBB-NEXT:    bltz a5, .LBB3_5
 ; RV32ZBB-NEXT:  # %bb.4:
@@ -374,10 +374,10 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    or a3, a3, a6
 ; RV32XTHEADBB-NEXT:  .LBB3_3:
 ; RV32XTHEADBB-NEXT:    srai a5, a5, 31
-; RV32XTHEADBB-NEXT:    and a4, a5, a4
 ; RV32XTHEADBB-NEXT:    neg a7, a2
-; RV32XTHEADBB-NEXT:    li a5, 32
-; RV32XTHEADBB-NEXT:    sub a5, a5, a2
+; RV32XTHEADBB-NEXT:    li a6, 32
+; RV32XTHEADBB-NEXT:    and a4, a5, a4
+; RV32XTHEADBB-NEXT:    sub a5, a6, a2
 ; RV32XTHEADBB-NEXT:    sll a6, a0, a7
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB3_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
@@ -1442,45 +1442,45 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32I-LABEL: rotl_64_mask_shared:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a5, a4, 26
-; RV32I-NEXT:    srli a5, a5, 31
-; RV32I-NEXT:    mv a7, a0
-; RV32I-NEXT:    bnez a5, .LBB17_2
+; RV32I-NEXT:    srli t0, a5, 31
+; RV32I-NEXT:    mv a6, a0
+; RV32I-NEXT:    bnez t0, .LBB17_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a7, a1
+; RV32I-NEXT:    mv a6, a1
 ; RV32I-NEXT:  .LBB17_2:
-; RV32I-NEXT:    andi a6, a4, 63
-; RV32I-NEXT:    sll t0, a7, a4
-; RV32I-NEXT:    bnez a5, .LBB17_4
+; RV32I-NEXT:    andi a5, a4, 63
+; RV32I-NEXT:    sll a7, a6, a4
+; RV32I-NEXT:    bnez t0, .LBB17_4
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:  .LBB17_4:
 ; RV32I-NEXT:    srli a0, a1, 1
-; RV32I-NEXT:    not t1, a4
-; RV32I-NEXT:    srl a0, a0, t1
-; RV32I-NEXT:    or a5, t0, a0
-; RV32I-NEXT:    sll a1, a1, a4
-; RV32I-NEXT:    srli a0, a7, 1
-; RV32I-NEXT:    srl a7, a0, t1
-; RV32I-NEXT:    addi a0, a6, -32
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    not t0, a4
+; RV32I-NEXT:    sll t1, a1, a4
+; RV32I-NEXT:    srli a1, a6, 1
+; RV32I-NEXT:    srl a6, a0, t0
+; RV32I-NEXT:    srl t0, a1, t0
+; RV32I-NEXT:    addi a0, a5, -32
+; RV32I-NEXT:    or a1, a7, a6
+; RV32I-NEXT:    or a6, t1, t0
 ; RV32I-NEXT:    bltz a0, .LBB17_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    sll a3, a2, a6
+; RV32I-NEXT:    sll a3, a2, a5
 ; RV32I-NEXT:    j .LBB17_7
 ; RV32I-NEXT:  .LBB17_6:
 ; RV32I-NEXT:    sll a3, a3, a4
 ; RV32I-NEXT:    srli a7, a2, 1
-; RV32I-NEXT:    not a6, a6
-; RV32I-NEXT:    srl a6, a7, a6
-; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    not a5, a5
+; RV32I-NEXT:    srl a5, a7, a5
+; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:  .LBB17_7:
 ; RV32I-NEXT:    sll a2, a2, a4
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    sltu a1, a0, a1
-; RV32I-NEXT:    add a3, a5, a3
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    sltu a2, a0, a6
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: rotl_64_mask_shared:
@@ -1496,45 +1496,45 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32ZBB-LABEL: rotl_64_mask_shared:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    slli a5, a4, 26
-; RV32ZBB-NEXT:    srli a5, a5, 31
-; RV32ZBB-NEXT:    mv a7, a0
-; RV32ZBB-NEXT:    bnez a5, .LBB17_2
+; RV32ZBB-NEXT:    srli t0, a5, 31
+; RV32ZBB-NEXT:    mv a6, a0
+; RV32ZBB-NEXT:    bnez t0, .LBB17_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    mv a7, a1
+; RV32ZBB-NEXT:    mv a6, a1
 ; RV32ZBB-NEXT:  .LBB17_2:
-; RV32ZBB-NEXT:    andi a6, a4, 63
-; RV32ZBB-NEXT:    sll t0, a7, a4
-; RV32ZBB-NEXT:    bnez a5, .LBB17_4
+; RV32ZBB-NEXT:    andi a5, a4, 63
+; RV32ZBB-NEXT:    sll a7, a6, a4
+; RV32ZBB-NEXT:    bnez t0, .LBB17_4
 ; RV32ZBB-NEXT:  # %bb.3:
 ; RV32ZBB-NEXT:    mv a1, a0
 ; RV32ZBB-NEXT:  .LBB17_4:
 ; RV32ZBB-NEXT:    srli a0, a1, 1
-; RV32ZBB-NEXT:    not t1, a4
-; RV32ZBB-NEXT:    srl a0, a0, t1
-; RV32ZBB-NEXT:    or a5, t0, a0
-; RV32ZBB-NEXT:    sll a1, a1, a4
-; RV32ZBB-NEXT:    srli a0, a7, 1
-; RV32ZBB-NEXT:    srl a7, a0, t1
-; RV32ZBB-NEXT:    addi a0, a6, -32
-; RV32ZBB-NEXT:    or a1, a1, a7
+; RV32ZBB-NEXT:    not t0, a4
+; RV32ZBB-NEXT:    sll t1, a1, a4
+; RV32ZBB-NEXT:    srli a1, a6, 1
+; RV32ZBB-NEXT:    srl a6, a0, t0
+; RV32ZBB-NEXT:    srl t0, a1, t0
+; RV32ZBB-NEXT:    addi a0, a5, -32
+; RV32ZBB-NEXT:    or a1, a7, a6
+; RV32ZBB-NEXT:    or a6, t1, t0
 ; RV32ZBB-NEXT:    bltz a0, .LBB17_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    sll a3, a2, a6
+; RV32ZBB-NEXT:    sll a3, a2, a5
 ; RV32ZBB-NEXT:    j .LBB17_7
 ; RV32ZBB-NEXT:  .LBB17_6:
 ; RV32ZBB-NEXT:    sll a3, a3, a4
 ; RV32ZBB-NEXT:    srli a7, a2, 1
-; RV32ZBB-NEXT:    not a6, a6
-; RV32ZBB-NEXT:    srl a6, a7, a6
-; RV32ZBB-NEXT:    or a3, a3, a6
+; RV32ZBB-NEXT:    not a5, a5
+; RV32ZBB-NEXT:    srl a5, a7, a5
+; RV32ZBB-NEXT:    or a3, a3, a5
 ; RV32ZBB-NEXT:  .LBB17_7:
 ; RV32ZBB-NEXT:    sll a2, a2, a4
 ; RV32ZBB-NEXT:    srai a0, a0, 31
 ; RV32ZBB-NEXT:    and a0, a0, a2
-; RV32ZBB-NEXT:    add a0, a1, a0
-; RV32ZBB-NEXT:    sltu a1, a0, a1
-; RV32ZBB-NEXT:    add a3, a5, a3
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add a0, a6, a0
+; RV32ZBB-NEXT:    sltu a2, a0, a6
+; RV32ZBB-NEXT:    add a1, a1, a3
+; RV32ZBB-NEXT:    add a1, a1, a2
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: rotl_64_mask_shared:
@@ -1546,45 +1546,45 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ;
 ; RV32XTHEADBB-LABEL: rotl_64_mask_shared:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    th.extu a5, a4, 5, 5
-; RV32XTHEADBB-NEXT:    mv a7, a0
-; RV32XTHEADBB-NEXT:    bnez a5, .LBB17_2
+; RV32XTHEADBB-NEXT:    th.extu t0, a4, 5, 5
+; RV32XTHEADBB-NEXT:    mv a6, a0
+; RV32XTHEADBB-NEXT:    bnez t0, .LBB17_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
-; RV32XTHEADBB-NEXT:    mv a7, a1
+; RV32XTHEADBB-NEXT:    mv a6, a1
 ; RV32XTHEADBB-NEXT:  .LBB17_2:
-; RV32XTHEADBB-NEXT:    andi a6, a4, 63
-; RV32XTHEADBB-NEXT:    sll t0, a7, a4
-; RV32XTHEADBB-NEXT:    bnez a5, .LBB17_4
+; RV32XTHEADBB-NEXT:    andi a5, a4, 63
+; RV32XTHEADBB-NEXT:    sll a7, a6, a4
+; RV32XTHEADBB-NEXT:    bnez t0, .LBB17_4
 ; RV32XTHEADBB-NEXT:  # %bb.3:
 ; RV32XTHEADBB-NEXT:    mv a1, a0
 ; RV32XTHEADBB-NEXT:  .LBB17_4:
 ; RV32XTHEADBB-NEXT:    srli a0, a1, 1
-; RV32XTHEADBB-NEXT:    not t1, a4
-; RV32XTHEADBB-NEXT:    srl a0, a0, t1
-; RV32XTHEADBB-NEXT:    or a5, t0, a0
-; RV32XTHEADBB-NEXT:    sll a1, a1, a4
-; RV32XTHEADBB-NEXT:    srli a0, a7, 1
-; RV32XTHEADBB-NEXT:    srl a7, a0, t1
-; RV32XTHEADBB-NEXT:    addi a0, a6, -32
-; RV32XTHEADBB-NEXT:    or a1, a1, a7
+; RV32XTHEADBB-NEXT:    not t0, a4
+; RV32XTHEADBB-NEXT:    sll t1, a1, a4
+; RV32XTHEADBB-NEXT:    srli a1, a6, 1
+; RV32XTHEADBB-NEXT:    srl a6, a0, t0
+; RV32XTHEADBB-NEXT:    srl t0, a1, t0
+; RV32XTHEADBB-NEXT:    addi a0, a5, -32
+; RV32XTHEADBB-NEXT:    or a1, a7, a6
+; RV32XTHEADBB-NEXT:    or a6, t1, t0
 ; RV32XTHEADBB-NEXT:    bltz a0, .LBB17_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
-; RV32XTHEADBB-NEXT:    sll a3, a2, a6
+; RV32XTHEADBB-NEXT:    sll a3, a2, a5
 ; RV32XTHEADBB-NEXT:    j .LBB17_7
 ; RV32XTHEADBB-NEXT:  .LBB17_6:
 ; RV32XTHEADBB-NEXT:    sll a3, a3, a4
 ; RV32XTHEADBB-NEXT:    srli a7, a2, 1
-; RV32XTHEADBB-NEXT:    not a6, a6
-; RV32XTHEADBB-NEXT:    srl a6, a7, a6
-; RV32XTHEADBB-NEXT:    or a3, a3, a6
+; RV32XTHEADBB-NEXT:    not a5, a5
+; RV32XTHEADBB-NEXT:    srl a5, a7, a5
+; RV32XTHEADBB-NEXT:    or a3, a3, a5
 ; RV32XTHEADBB-NEXT:  .LBB17_7:
 ; RV32XTHEADBB-NEXT:    sll a2, a2, a4
 ; RV32XTHEADBB-NEXT:    srai a0, a0, 31
 ; RV32XTHEADBB-NEXT:    and a0, a0, a2
-; RV32XTHEADBB-NEXT:    add a0, a1, a0
-; RV32XTHEADBB-NEXT:    sltu a1, a0, a1
-; RV32XTHEADBB-NEXT:    add a3, a5, a3
-; RV32XTHEADBB-NEXT:    add a1, a3, a1
+; RV32XTHEADBB-NEXT:    add a0, a6, a0
+; RV32XTHEADBB-NEXT:    sltu a2, a0, a6
+; RV32XTHEADBB-NEXT:    add a1, a1, a3
+; RV32XTHEADBB-NEXT:    add a1, a1, a2
 ; RV32XTHEADBB-NEXT:    ret
 ;
 ; RV64XTHEADBB-LABEL: rotl_64_mask_shared:
@@ -1669,27 +1669,27 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
 define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 signext %amt) nounwind {
 ; RV32I-LABEL: rotr_64_mask_shared:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a7, a4, 32
+; RV32I-NEXT:    andi t0, a4, 32
 ; RV32I-NEXT:    mv a6, a1
-; RV32I-NEXT:    beqz a7, .LBB19_2
+; RV32I-NEXT:    beqz t0, .LBB19_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a6, a0
 ; RV32I-NEXT:  .LBB19_2:
 ; RV32I-NEXT:    andi a5, a4, 63
-; RV32I-NEXT:    srl t0, a6, a4
-; RV32I-NEXT:    beqz a7, .LBB19_4
+; RV32I-NEXT:    srl a7, a6, a4
+; RV32I-NEXT:    beqz t0, .LBB19_4
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:  .LBB19_4:
 ; RV32I-NEXT:    slli a1, a0, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    srl t0, a0, a4
+; RV32I-NEXT:    not t0, a4
+; RV32I-NEXT:    srl t1, a0, a4
 ; RV32I-NEXT:    slli a6, a6, 1
-; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    sll a1, a1, t0
+; RV32I-NEXT:    sll a6, a6, t0
 ; RV32I-NEXT:    addi a0, a5, -32
-; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a6, a6, t1
 ; RV32I-NEXT:    bltz a0, .LBB19_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    sll a3, a2, a5
@@ -1722,27 +1722,27 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ;
 ; RV32ZBB-LABEL: rotr_64_mask_shared:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    andi a7, a4, 32
+; RV32ZBB-NEXT:    andi t0, a4, 32
 ; RV32ZBB-NEXT:    mv a6, a1
-; RV32ZBB-NEXT:    beqz a7, .LBB19_2
+; RV32ZBB-NEXT:    beqz t0, .LBB19_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    mv a6, a0
 ; RV32ZBB-NEXT:  .LBB19_2:
 ; RV32ZBB-NEXT:    andi a5, a4, 63
-; RV32ZBB-NEXT:    srl t0, a6, a4
-; RV32ZBB-NEXT:    beqz a7, .LBB19_4
+; RV32ZBB-NEXT:    srl a7, a6, a4
+; RV32ZBB-NEXT:    beqz t0, .LBB19_4
 ; RV32ZBB-NEXT:  # %bb.3:
 ; RV32ZBB-NEXT:    mv a0, a1
 ; RV32ZBB-NEXT:  .LBB19_4:
 ; RV32ZBB-NEXT:    slli a1, a0, 1
-; RV32ZBB-NEXT:    not a7, a4
-; RV32ZBB-NEXT:    sll a1, a1, a7
-; RV32ZBB-NEXT:    or a1, a1, t0
-; RV32ZBB-NEXT:    srl t0, a0, a4
+; RV32ZBB-NEXT:    not t0, a4
+; RV32ZBB-NEXT:    srl t1, a0, a4
 ; RV32ZBB-NEXT:    slli a6, a6, 1
-; RV32ZBB-NEXT:    sll a6, a6, a7
+; RV32ZBB-NEXT:    sll a1, a1, t0
+; RV32ZBB-NEXT:    sll a6, a6, t0
 ; RV32ZBB-NEXT:    addi a0, a5, -32
-; RV32ZBB-NEXT:    or a6, a6, t0
+; RV32ZBB-NEXT:    or a1, a1, a7
+; RV32ZBB-NEXT:    or a6, a6, t1
 ; RV32ZBB-NEXT:    bltz a0, .LBB19_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    sll a3, a2, a5
@@ -1772,27 +1772,27 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ;
 ; RV32XTHEADBB-LABEL: rotr_64_mask_shared:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    andi a7, a4, 32
+; RV32XTHEADBB-NEXT:    andi t0, a4, 32
 ; RV32XTHEADBB-NEXT:    mv a6, a1
-; RV32XTHEADBB-NEXT:    beqz a7, .LBB19_2
+; RV32XTHEADBB-NEXT:    beqz t0, .LBB19_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
 ; RV32XTHEADBB-NEXT:    mv a6, a0
 ; RV32XTHEADBB-NEXT:  .LBB19_2:
 ; RV32XTHEADBB-NEXT:    andi a5, a4, 63
-; RV32XTHEADBB-NEXT:    srl t0, a6, a4
-; RV32XTHEADBB-NEXT:    beqz a7, .LBB19_4
+; RV32XTHEADBB-NEXT:    srl a7, a6, a4
+; RV32XTHEADBB-NEXT:    beqz t0, .LBB19_4
 ; RV32XTHEADBB-NEXT:  # %bb.3:
 ; RV32XTHEADBB-NEXT:    mv a0, a1
 ; RV32XTHEADBB-NEXT:  .LBB19_4:
 ; RV32XTHEADBB-NEXT:    slli a1, a0, 1
-; RV32XTHEADBB-NEXT:    not a7, a4
-; RV32XTHEADBB-NEXT:    sll a1, a1, a7
-; RV32XTHEADBB-NEXT:    or a1, a1, t0
-; RV32XTHEADBB-NEXT:    srl t0, a0, a4
+; RV32XTHEADBB-NEXT:    not t0, a4
+; RV32XTHEADBB-NEXT:    srl t1, a0, a4
 ; RV32XTHEADBB-NEXT:    slli a6, a6, 1
-; RV32XTHEADBB-NEXT:    sll a6, a6, a7
+; RV32XTHEADBB-NEXT:    sll a1, a1, t0
+; RV32XTHEADBB-NEXT:    sll a6, a6, t0
 ; RV32XTHEADBB-NEXT:    addi a0, a5, -32
-; RV32XTHEADBB-NEXT:    or a6, a6, t0
+; RV32XTHEADBB-NEXT:    or a1, a1, a7
+; RV32XTHEADBB-NEXT:    or a6, a6, t1
 ; RV32XTHEADBB-NEXT:    bltz a0, .LBB19_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
 ; RV32XTHEADBB-NEXT:    sll a3, a2, a5
@@ -1835,10 +1835,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sll a3, a0, a2
 ; RV32I-NEXT:    neg a4, a2
-; RV32I-NEXT:    srl a0, a0, a4
-; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    sll a2, a1, a2
+; RV32I-NEXT:    srl a0, a0, a4
 ; RV32I-NEXT:    srl a1, a1, a4
+; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -1847,10 +1847,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sllw a3, a0, a2
 ; RV64I-NEXT:    negw a4, a2
-; RV64I-NEXT:    srlw a0, a0, a4
-; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    sllw a2, a1, a2
+; RV64I-NEXT:    srlw a0, a0, a4
 ; RV64I-NEXT:    srlw a1, a1, a4
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -1873,10 +1873,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV32XTHEADBB:       # %bb.0:
 ; RV32XTHEADBB-NEXT:    sll a3, a0, a2
 ; RV32XTHEADBB-NEXT:    neg a4, a2
-; RV32XTHEADBB-NEXT:    srl a0, a0, a4
-; RV32XTHEADBB-NEXT:    or a0, a3, a0
 ; RV32XTHEADBB-NEXT:    sll a2, a1, a2
+; RV32XTHEADBB-NEXT:    srl a0, a0, a4
 ; RV32XTHEADBB-NEXT:    srl a1, a1, a4
+; RV32XTHEADBB-NEXT:    or a0, a3, a0
 ; RV32XTHEADBB-NEXT:    or a1, a2, a1
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
 ; RV32XTHEADBB-NEXT:    ret
@@ -1885,10 +1885,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    sllw a3, a0, a2
 ; RV64XTHEADBB-NEXT:    negw a4, a2
-; RV64XTHEADBB-NEXT:    srlw a0, a0, a4
-; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    sllw a2, a1, a2
+; RV64XTHEADBB-NEXT:    srlw a0, a0, a4
 ; RV64XTHEADBB-NEXT:    srlw a1, a1, a4
+; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    or a1, a2, a1
 ; RV64XTHEADBB-NEXT:    addw a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -1914,45 +1914,45 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:  .LBB21_4:
 ; RV32I-NEXT:    sll a7, a6, a4
-; RV32I-NEXT:    srli t0, a0, 1
+; RV32I-NEXT:    srli t1, a0, 1
 ; RV32I-NEXT:    not a1, a4
-; RV32I-NEXT:    srl t0, t0, a1
-; RV32I-NEXT:    sll t1, a0, a4
+; RV32I-NEXT:    sll t0, a0, a4
 ; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    srl t2, a0, a1
+; RV32I-NEXT:    srl a6, t1, a1
+; RV32I-NEXT:    srl t1, a0, a1
 ; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    bnez a5, .LBB21_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB21_6:
-; RV32I-NEXT:    or a6, a7, t0
-; RV32I-NEXT:    or a7, t1, t2
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a7, t0, t1
 ; RV32I-NEXT:    sll t0, a0, a4
 ; RV32I-NEXT:    bnez a5, .LBB21_8
 ; RV32I-NEXT:  # %bb.7:
 ; RV32I-NEXT:    mv a2, a3
 ; RV32I-NEXT:  .LBB21_8:
 ; RV32I-NEXT:    srli a3, a2, 1
-; RV32I-NEXT:    srl a3, a3, a1
-; RV32I-NEXT:    or a3, t0, a3
 ; RV32I-NEXT:    sll a2, a2, a4
 ; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a3, a3, a1
 ; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    or a1, t0, a3
 ; RV32I-NEXT:    or a0, a2, a0
-; RV32I-NEXT:    add a1, a7, a0
-; RV32I-NEXT:    add a0, a6, a3
-; RV32I-NEXT:    sltu a2, a0, a6
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a7, a7, a0
+; RV32I-NEXT:    add a0, a6, a1
+; RV32I-NEXT:    sltu a1, a0, a6
+; RV32I-NEXT:    add a1, a7, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: rotl_64_mask_multiple:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sll a3, a0, a2
 ; RV64I-NEXT:    negw a4, a2
-; RV64I-NEXT:    srl a0, a0, a4
-; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    sll a2, a1, a2
+; RV64I-NEXT:    srl a0, a0, a4
 ; RV64I-NEXT:    srl a1, a1, a4
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -1971,35 +1971,35 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32ZBB-NEXT:    mv a0, a1
 ; RV32ZBB-NEXT:  .LBB21_4:
 ; RV32ZBB-NEXT:    sll a7, a6, a4
-; RV32ZBB-NEXT:    srli t0, a0, 1
+; RV32ZBB-NEXT:    srli t1, a0, 1
 ; RV32ZBB-NEXT:    not a1, a4
-; RV32ZBB-NEXT:    srl t0, t0, a1
-; RV32ZBB-NEXT:    sll t1, a0, a4
+; RV32ZBB-NEXT:    sll t0, a0, a4
 ; RV32ZBB-NEXT:    srli a0, a6, 1
-; RV32ZBB-NEXT:    srl t2, a0, a1
+; RV32ZBB-NEXT:    srl a6, t1, a1
+; RV32ZBB-NEXT:    srl t1, a0, a1
 ; RV32ZBB-NEXT:    mv a0, a3
 ; RV32ZBB-NEXT:    bnez a5, .LBB21_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    mv a0, a2
 ; RV32ZBB-NEXT:  .LBB21_6:
-; RV32ZBB-NEXT:    or a6, a7, t0
-; RV32ZBB-NEXT:    or a7, t1, t2
+; RV32ZBB-NEXT:    or a6, a7, a6
+; RV32ZBB-NEXT:    or a7, t0, t1
 ; RV32ZBB-NEXT:    sll t0, a0, a4
 ; RV32ZBB-NEXT:    bnez a5, .LBB21_8
 ; RV32ZBB-NEXT:  # %bb.7:
 ; RV32ZBB-NEXT:    mv a2, a3
 ; RV32ZBB-NEXT:  .LBB21_8:
 ; RV32ZBB-NEXT:    srli a3, a2, 1
-; RV32ZBB-NEXT:    srl a3, a3, a1
-; RV32ZBB-NEXT:    or a3, t0, a3
 ; RV32ZBB-NEXT:    sll a2, a2, a4
 ; RV32ZBB-NEXT:    srli a0, a0, 1
+; RV32ZBB-NEXT:    srl a3, a3, a1
 ; RV32ZBB-NEXT:    srl a0, a0, a1
+; RV32ZBB-NEXT:    or a1, t0, a3
 ; RV32ZBB-NEXT:    or a0, a2, a0
-; RV32ZBB-NEXT:    add a1, a7, a0
-; RV32ZBB-NEXT:    add a0, a6, a3
-; RV32ZBB-NEXT:    sltu a2, a0, a6
-; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    add a7, a7, a0
+; RV32ZBB-NEXT:    add a0, a6, a1
+; RV32ZBB-NEXT:    sltu a1, a0, a6
+; RV32ZBB-NEXT:    add a1, a7, a1
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: rotl_64_mask_multiple:
@@ -2022,45 +2022,45 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32XTHEADBB-NEXT:    mv a0, a1
 ; RV32XTHEADBB-NEXT:  .LBB21_4:
 ; RV32XTHEADBB-NEXT:    sll a7, a6, a4
-; RV32XTHEADBB-NEXT:    srli t0, a0, 1
+; RV32XTHEADBB-NEXT:    srli t1, a0, 1
 ; RV32XTHEADBB-NEXT:    not a1, a4
-; RV32XTHEADBB-NEXT:    srl t0, t0, a1
-; RV32XTHEADBB-NEXT:    sll t1, a0, a4
+; RV32XTHEADBB-NEXT:    sll t0, a0, a4
 ; RV32XTHEADBB-NEXT:    srli a0, a6, 1
-; RV32XTHEADBB-NEXT:    srl t2, a0, a1
+; RV32XTHEADBB-NEXT:    srl a6, t1, a1
+; RV32XTHEADBB-NEXT:    srl t1, a0, a1
 ; RV32XTHEADBB-NEXT:    mv a0, a3
 ; RV32XTHEADBB-NEXT:    bnez a5, .LBB21_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
 ; RV32XTHEADBB-NEXT:    mv a0, a2
 ; RV32XTHEADBB-NEXT:  .LBB21_6:
-; RV32XTHEADBB-NEXT:    or a6, a7, t0
-; RV32XTHEADBB-NEXT:    or a7, t1, t2
+; RV32XTHEADBB-NEXT:    or a6, a7, a6
+; RV32XTHEADBB-NEXT:    or a7, t0, t1
 ; RV32XTHEADBB-NEXT:    sll t0, a0, a4
 ; RV32XTHEADBB-NEXT:    bnez a5, .LBB21_8
 ; RV32XTHEADBB-NEXT:  # %bb.7:
 ; RV32XTHEADBB-NEXT:    mv a2, a3
 ; RV32XTHEADBB-NEXT:  .LBB21_8:
 ; RV32XTHEADBB-NEXT:    srli a3, a2, 1
-; RV32XTHEADBB-NEXT:    srl a3, a3, a1
-; RV32XTHEADBB-NEXT:    or a3, t0, a3
 ; RV32XTHEADBB-NEXT:    sll a2, a2, a4
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 1
+; RV32XTHEADBB-NEXT:    srl a3, a3, a1
 ; RV32XTHEADBB-NEXT:    srl a0, a0, a1
+; RV32XTHEADBB-NEXT:    or a1, t0, a3
 ; RV32XTHEADBB-NEXT:    or a0, a2, a0
-; RV32XTHEADBB-NEXT:    add a1, a7, a0
-; RV32XTHEADBB-NEXT:    add a0, a6, a3
-; RV32XTHEADBB-NEXT:    sltu a2, a0, a6
-; RV32XTHEADBB-NEXT:    add a1, a1, a2
+; RV32XTHEADBB-NEXT:    add a7, a7, a0
+; RV32XTHEADBB-NEXT:    add a0, a6, a1
+; RV32XTHEADBB-NEXT:    sltu a1, a0, a6
+; RV32XTHEADBB-NEXT:    add a1, a7, a1
 ; RV32XTHEADBB-NEXT:    ret
 ;
 ; RV64XTHEADBB-LABEL: rotl_64_mask_multiple:
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    sll a3, a0, a2
 ; RV64XTHEADBB-NEXT:    negw a4, a2
-; RV64XTHEADBB-NEXT:    srl a0, a0, a4
-; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    sll a2, a1, a2
+; RV64XTHEADBB-NEXT:    srl a0, a0, a4
 ; RV64XTHEADBB-NEXT:    srl a1, a1, a4
+; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    or a1, a2, a1
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -2076,10 +2076,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srl a3, a0, a2
 ; RV32I-NEXT:    neg a4, a2
-; RV32I-NEXT:    sll a0, a0, a4
-; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    srl a2, a1, a2
+; RV32I-NEXT:    sll a0, a0, a4
 ; RV32I-NEXT:    sll a1, a1, a4
+; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -2088,10 +2088,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srlw a3, a0, a2
 ; RV64I-NEXT:    negw a4, a2
-; RV64I-NEXT:    sllw a0, a0, a4
-; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    srlw a2, a1, a2
+; RV64I-NEXT:    sllw a0, a0, a4
 ; RV64I-NEXT:    sllw a1, a1, a4
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -2114,10 +2114,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV32XTHEADBB:       # %bb.0:
 ; RV32XTHEADBB-NEXT:    srl a3, a0, a2
 ; RV32XTHEADBB-NEXT:    neg a4, a2
-; RV32XTHEADBB-NEXT:    sll a0, a0, a4
-; RV32XTHEADBB-NEXT:    or a0, a3, a0
 ; RV32XTHEADBB-NEXT:    srl a2, a1, a2
+; RV32XTHEADBB-NEXT:    sll a0, a0, a4
 ; RV32XTHEADBB-NEXT:    sll a1, a1, a4
+; RV32XTHEADBB-NEXT:    or a0, a3, a0
 ; RV32XTHEADBB-NEXT:    or a1, a2, a1
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
 ; RV32XTHEADBB-NEXT:    ret
@@ -2126,10 +2126,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    srlw a3, a0, a2
 ; RV64XTHEADBB-NEXT:    negw a4, a2
-; RV64XTHEADBB-NEXT:    sllw a0, a0, a4
-; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    srlw a2, a1, a2
+; RV64XTHEADBB-NEXT:    sllw a0, a0, a4
 ; RV64XTHEADBB-NEXT:    sllw a1, a1, a4
+; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    or a1, a2, a1
 ; RV64XTHEADBB-NEXT:    addw a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -2154,30 +2154,30 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:  .LBB23_4:
 ; RV32I-NEXT:    srl a7, a6, a4
-; RV32I-NEXT:    slli t0, a1, 1
+; RV32I-NEXT:    slli t1, a1, 1
 ; RV32I-NEXT:    not a0, a4
-; RV32I-NEXT:    sll t0, t0, a0
-; RV32I-NEXT:    srl t1, a1, a4
+; RV32I-NEXT:    srl t0, a1, a4
 ; RV32I-NEXT:    slli a6, a6, 1
-; RV32I-NEXT:    sll t2, a6, a0
+; RV32I-NEXT:    sll a1, t1, a0
+; RV32I-NEXT:    sll t1, a6, a0
 ; RV32I-NEXT:    mv a6, a2
 ; RV32I-NEXT:    beqz a5, .LBB23_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    mv a6, a3
 ; RV32I-NEXT:  .LBB23_6:
-; RV32I-NEXT:    or a1, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    srl t0, a6, a4
 ; RV32I-NEXT:    beqz a5, .LBB23_8
 ; RV32I-NEXT:  # %bb.7:
 ; RV32I-NEXT:    mv a3, a2
 ; RV32I-NEXT:  .LBB23_8:
 ; RV32I-NEXT:    slli a2, a3, 1
-; RV32I-NEXT:    sll a2, a2, a0
-; RV32I-NEXT:    or a2, a2, t0
 ; RV32I-NEXT:    srl a3, a3, a4
 ; RV32I-NEXT:    slli a6, a6, 1
+; RV32I-NEXT:    sll a2, a2, a0
 ; RV32I-NEXT:    sll a0, a6, a0
+; RV32I-NEXT:    or a2, a2, t0
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    add a7, a7, a0
 ; RV32I-NEXT:    add a0, a1, a2
@@ -2189,10 +2189,10 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srl a3, a0, a2
 ; RV64I-NEXT:    negw a4, a2
-; RV64I-NEXT:    sll a0, a0, a4
-; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    srl a2, a1, a2
+; RV64I-NEXT:    sll a0, a0, a4
 ; RV64I-NEXT:    sll a1, a1, a4
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -2210,30 +2210,30 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32ZBB-NEXT:    mv a1, a0
 ; RV32ZBB-NEXT:  .LBB23_4:
 ; RV32ZBB-NEXT:    srl a7, a6, a4
-; RV32ZBB-NEXT:    slli t0, a1, 1
+; RV32ZBB-NEXT:    slli t1, a1, 1
 ; RV32ZBB-NEXT:    not a0, a4
-; RV32ZBB-NEXT:    sll t0, t0, a0
-; RV32ZBB-NEXT:    srl t1, a1, a4
+; RV32ZBB-NEXT:    srl t0, a1, a4
 ; RV32ZBB-NEXT:    slli a6, a6, 1
-; RV32ZBB-NEXT:    sll t2, a6, a0
+; RV32ZBB-NEXT:    sll a1, t1, a0
+; RV32ZBB-NEXT:    sll t1, a6, a0
 ; RV32ZBB-NEXT:    mv a6, a2
 ; RV32ZBB-NEXT:    beqz a5, .LBB23_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    mv a6, a3
 ; RV32ZBB-NEXT:  .LBB23_6:
-; RV32ZBB-NEXT:    or a1, t0, a7
-; RV32ZBB-NEXT:    or a7, t2, t1
+; RV32ZBB-NEXT:    or a1, a1, a7
+; RV32ZBB-NEXT:    or a7, t1, t0
 ; RV32ZBB-NEXT:    srl t0, a6, a4
 ; RV32ZBB-NEXT:    beqz a5, .LBB23_8
 ; RV32ZBB-NEXT:  # %bb.7:
 ; RV32ZBB-NEXT:    mv a3, a2
 ; RV32ZBB-NEXT:  .LBB23_8:
 ; RV32ZBB-NEXT:    slli a2, a3, 1
-; RV32ZBB-NEXT:    sll a2, a2, a0
-; RV32ZBB-NEXT:    or a2, a2, t0
 ; RV32ZBB-NEXT:    srl a3, a3, a4
 ; RV32ZBB-NEXT:    slli a6, a6, 1
+; RV32ZBB-NEXT:    sll a2, a2, a0
 ; RV32ZBB-NEXT:    sll a0, a6, a0
+; RV32ZBB-NEXT:    or a2, a2, t0
 ; RV32ZBB-NEXT:    or a0, a0, a3
 ; RV32ZBB-NEXT:    add a7, a7, a0
 ; RV32ZBB-NEXT:    add a0, a1, a2
@@ -2261,30 +2261,30 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32XTHEADBB-NEXT:    mv a1, a0
 ; RV32XTHEADBB-NEXT:  .LBB23_4:
 ; RV32XTHEADBB-NEXT:    srl a7, a6, a4
-; RV32XTHEADBB-NEXT:    slli t0, a1, 1
+; RV32XTHEADBB-NEXT:    slli t1, a1, 1
 ; RV32XTHEADBB-NEXT:    not a0, a4
-; RV32XTHEADBB-NEXT:    sll t0, t0, a0
-; RV32XTHEADBB-NEXT:    srl t1, a1, a4
+; RV32XTHEADBB-NEXT:    srl t0, a1, a4
 ; RV32XTHEADBB-NEXT:    slli a6, a6, 1
-; RV32XTHEADBB-NEXT:    sll t2, a6, a0
+; RV32XTHEADBB-NEXT:    sll a1, t1, a0
+; RV32XTHEADBB-NEXT:    sll t1, a6, a0
 ; RV32XTHEADBB-NEXT:    mv a6, a2
 ; RV32XTHEADBB-NEXT:    beqz a5, .LBB23_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
 ; RV32XTHEADBB-NEXT:    mv a6, a3
 ; RV32XTHEADBB-NEXT:  .LBB23_6:
-; RV32XTHEADBB-NEXT:    or a1, t0, a7
-; RV32XTHEADBB-NEXT:    or a7, t2, t1
+; RV32XTHEADBB-NEXT:    or a1, a1, a7
+; RV32XTHEADBB-NEXT:    or a7, t1, t0
 ; RV32XTHEADBB-NEXT:    srl t0, a6, a4
 ; RV32XTHEADBB-NEXT:    beqz a5, .LBB23_8
 ; RV32XTHEADBB-NEXT:  # %bb.7:
 ; RV32XTHEADBB-NEXT:    mv a3, a2
 ; RV32XTHEADBB-NEXT:  .LBB23_8:
 ; RV32XTHEADBB-NEXT:    slli a2, a3, 1
-; RV32XTHEADBB-NEXT:    sll a2, a2, a0
-; RV32XTHEADBB-NEXT:    or a2, a2, t0
 ; RV32XTHEADBB-NEXT:    srl a3, a3, a4
 ; RV32XTHEADBB-NEXT:    slli a6, a6, 1
+; RV32XTHEADBB-NEXT:    sll a2, a2, a0
 ; RV32XTHEADBB-NEXT:    sll a0, a6, a0
+; RV32XTHEADBB-NEXT:    or a2, a2, t0
 ; RV32XTHEADBB-NEXT:    or a0, a0, a3
 ; RV32XTHEADBB-NEXT:    add a7, a7, a0
 ; RV32XTHEADBB-NEXT:    add a0, a1, a2
@@ -2296,10 +2296,10 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    srl a3, a0, a2
 ; RV64XTHEADBB-NEXT:    negw a4, a2
-; RV64XTHEADBB-NEXT:    sll a0, a0, a4
-; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    srl a2, a1, a2
+; RV64XTHEADBB-NEXT:    sll a0, a0, a4
 ; RV64XTHEADBB-NEXT:    sll a1, a1, a4
+; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    or a1, a2, a1
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -2328,9 +2328,9 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-NEXT:    or a3, a3, a7
 ; RV32I-NEXT:  .LBB24_3:
 ; RV32I-NEXT:    srai a6, a6, 31
+; RV32I-NEXT:    li a7, 32
 ; RV32I-NEXT:    and a5, a6, a5
-; RV32I-NEXT:    li a6, 32
-; RV32I-NEXT:    sub a6, a6, a2
+; RV32I-NEXT:    sub a6, a7, a2
 ; RV32I-NEXT:    srl a7, a1, a4
 ; RV32I-NEXT:    bltz a6, .LBB24_5
 ; RV32I-NEXT:  # %bb.4:
@@ -2338,8 +2338,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-NEXT:    j .LBB24_6
 ; RV32I-NEXT:  .LBB24_5:
 ; RV32I-NEXT:    li t0, 64
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    srl a0, a0, a4
+; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    not a2, a2
 ; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    sll a1, a1, a2
@@ -2376,9 +2376,9 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-NEXT:    or a3, a3, a7
 ; RV32ZBB-NEXT:  .LBB24_3:
 ; RV32ZBB-NEXT:    srai a6, a6, 31
+; RV32ZBB-NEXT:    li a7, 32
 ; RV32ZBB-NEXT:    and a5, a6, a5
-; RV32ZBB-NEXT:    li a6, 32
-; RV32ZBB-NEXT:    sub a6, a6, a2
+; RV32ZBB-NEXT:    sub a6, a7, a2
 ; RV32ZBB-NEXT:    srl a7, a1, a4
 ; RV32ZBB-NEXT:    bltz a6, .LBB24_5
 ; RV32ZBB-NEXT:  # %bb.4:
@@ -2386,8 +2386,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-NEXT:    j .LBB24_6
 ; RV32ZBB-NEXT:  .LBB24_5:
 ; RV32ZBB-NEXT:    li t0, 64
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    srl a0, a0, a4
+; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    not a2, a2
 ; RV32ZBB-NEXT:    slli a1, a1, 1
 ; RV32ZBB-NEXT:    sll a1, a1, a2
@@ -2421,9 +2421,9 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    or a3, a3, a7
 ; RV32XTHEADBB-NEXT:  .LBB24_3:
 ; RV32XTHEADBB-NEXT:    srai a6, a6, 31
+; RV32XTHEADBB-NEXT:    li a7, 32
 ; RV32XTHEADBB-NEXT:    and a5, a6, a5
-; RV32XTHEADBB-NEXT:    li a6, 32
-; RV32XTHEADBB-NEXT:    sub a6, a6, a2
+; RV32XTHEADBB-NEXT:    sub a6, a7, a2
 ; RV32XTHEADBB-NEXT:    srl a7, a1, a4
 ; RV32XTHEADBB-NEXT:    bltz a6, .LBB24_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
@@ -2431,8 +2431,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    j .LBB24_6
 ; RV32XTHEADBB-NEXT:  .LBB24_5:
 ; RV32XTHEADBB-NEXT:    li t0, 64
-; RV32XTHEADBB-NEXT:    sub a2, t0, a2
 ; RV32XTHEADBB-NEXT:    srl a0, a0, a4
+; RV32XTHEADBB-NEXT:    sub a2, t0, a2
 ; RV32XTHEADBB-NEXT:    not a2, a2
 ; RV32XTHEADBB-NEXT:    slli a1, a1, 1
 ; RV32XTHEADBB-NEXT:    sll a1, a1, a2
@@ -2478,9 +2478,9 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-NEXT:    or a3, a3, a7
 ; RV32I-NEXT:  .LBB25_3:
 ; RV32I-NEXT:    srai a6, a6, 31
+; RV32I-NEXT:    li a7, 32
 ; RV32I-NEXT:    and a5, a6, a5
-; RV32I-NEXT:    li a6, 32
-; RV32I-NEXT:    sub a6, a6, a2
+; RV32I-NEXT:    sub a6, a7, a2
 ; RV32I-NEXT:    sll a7, a0, a4
 ; RV32I-NEXT:    bltz a6, .LBB25_5
 ; RV32I-NEXT:  # %bb.4:
@@ -2488,8 +2488,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-NEXT:    j .LBB25_6
 ; RV32I-NEXT:  .LBB25_5:
 ; RV32I-NEXT:    li t0, 64
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sll a1, a1, a4
+; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    not a2, a2
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    srl a0, a0, a2
@@ -2526,9 +2526,9 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-NEXT:    or a3, a3, a7
 ; RV32ZBB-NEXT:  .LBB25_3:
 ; RV32ZBB-NEXT:    srai a6, a6, 31
+; RV32ZBB-NEXT:    li a7, 32
 ; RV32ZBB-NEXT:    and a5, a6, a5
-; RV32ZBB-NEXT:    li a6, 32
-; RV32ZBB-NEXT:    sub a6, a6, a2
+; RV32ZBB-NEXT:    sub a6, a7, a2
 ; RV32ZBB-NEXT:    sll a7, a0, a4
 ; RV32ZBB-NEXT:    bltz a6, .LBB25_5
 ; RV32ZBB-NEXT:  # %bb.4:
@@ -2536,8 +2536,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-NEXT:    j .LBB25_6
 ; RV32ZBB-NEXT:  .LBB25_5:
 ; RV32ZBB-NEXT:    li t0, 64
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sll a1, a1, a4
+; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    not a2, a2
 ; RV32ZBB-NEXT:    srli a0, a0, 1
 ; RV32ZBB-NEXT:    srl a0, a0, a2
@@ -2571,9 +2571,9 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    or a3, a3, a7
 ; RV32XTHEADBB-NEXT:  .LBB25_3:
 ; RV32XTHEADBB-NEXT:    srai a6, a6, 31
+; RV32XTHEADBB-NEXT:    li a7, 32
 ; RV32XTHEADBB-NEXT:    and a5, a6, a5
-; RV32XTHEADBB-NEXT:    li a6, 32
-; RV32XTHEADBB-NEXT:    sub a6, a6, a2
+; RV32XTHEADBB-NEXT:    sub a6, a7, a2
 ; RV32XTHEADBB-NEXT:    sll a7, a0, a4
 ; RV32XTHEADBB-NEXT:    bltz a6, .LBB25_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
@@ -2581,8 +2581,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    j .LBB25_6
 ; RV32XTHEADBB-NEXT:  .LBB25_5:
 ; RV32XTHEADBB-NEXT:    li t0, 64
-; RV32XTHEADBB-NEXT:    sub a2, t0, a2
 ; RV32XTHEADBB-NEXT:    sll a1, a1, a4
+; RV32XTHEADBB-NEXT:    sub a2, t0, a2
 ; RV32XTHEADBB-NEXT:    not a2, a2
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 1
 ; RV32XTHEADBB-NEXT:    srl a0, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
index 248d6209d58239..4bb8d6c248caa2 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
@@ -12,31 +12,31 @@ define i32 @ctlz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    beqz a0, .LBB0_2
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -62,11 +62,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: ctlz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -440,11 +440,11 @@ define i32 @bswap_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
@@ -466,25 +466,24 @@ define i64 @bswap_i64(i64 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 8
 ; RV32I-NEXT:    lui a3, 16
+; RV32I-NEXT:    srli a4, a1, 24
+; RV32I-NEXT:    srli a5, a0, 8
 ; RV32I-NEXT:    addi a3, a3, -256
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    or a2, a1, a2
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a5, a1, 24
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a0, a3
+; RV32I-NEXT:    or a0, a1, a2
+; RV32I-NEXT:    or a1, a3, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV32XTHEADBB-LABEL: bswap_i64:
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
index 87f6f62ce68ddd..b6344f88cddaa5 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
@@ -140,25 +140,24 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    slli a5, a2, 26
 ; CHECK-NEXT:    srli a5, a5, 31
-; CHECK-NEXT:    mv a4, a1
+; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:    bnez a5, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a4, a0
+; CHECK-NEXT:    mv a3, a0
 ; CHECK-NEXT:  .LBB7_2:
-; CHECK-NEXT:    sll a3, a4, a2
+; CHECK-NEXT:    sll a4, a3, a2
 ; CHECK-NEXT:    bnez a5, .LBB7_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    not a5, a2
-; CHECK-NEXT:    srl a1, a1, a5
-; CHECK-NEXT:    or a3, a3, a1
-; CHECK-NEXT:    sll a0, a0, a2
-; CHECK-NEXT:    srli a4, a4, 1
-; CHECK-NEXT:    srl a1, a4, a5
-; CHECK-NEXT:    or a1, a0, a1
-; CHECK-NEXT:    mv a0, a3
+; CHECK-NEXT:    sll a2, a0, a2
+; CHECK-NEXT:    srli a3, a3, 1
+; CHECK-NEXT:    srl a0, a1, a5
+; CHECK-NEXT:    srl a1, a3, a5
+; CHECK-NEXT:    or a0, a4, a0
+; CHECK-NEXT:    or a1, a2, a1
 ; CHECK-NEXT:    ret
   %or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b)
   ret i64 %or
@@ -191,24 +190,24 @@ declare i64 @llvm.fshr.i64(i64, i64, i64)
 define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: ror_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi a4, a2, 32
+; CHECK-NEXT:    andi a5, a2, 32
 ; CHECK-NEXT:    mv a3, a0
-; CHECK-NEXT:    beqz a4, .LBB9_2
+; CHECK-NEXT:    beqz a5, .LBB9_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:  .LBB9_2:
-; CHECK-NEXT:    srl a5, a3, a2
-; CHECK-NEXT:    beqz a4, .LBB9_4
+; CHECK-NEXT:    srl a4, a3, a2
+; CHECK-NEXT:    beqz a5, .LBB9_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:  .LBB9_4:
 ; CHECK-NEXT:    slli a0, a1, 1
-; CHECK-NEXT:    not a4, a2
-; CHECK-NEXT:    sll a0, a0, a4
-; CHECK-NEXT:    or a0, a0, a5
+; CHECK-NEXT:    not a5, a2
 ; CHECK-NEXT:    srl a1, a1, a2
 ; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    sll a2, a3, a4
+; CHECK-NEXT:    sll a0, a0, a5
+; CHECK-NEXT:    sll a2, a3, a5
+; CHECK-NEXT:    or a0, a0, a4
 ; CHECK-NEXT:    or a1, a2, a1
 ; CHECK-NEXT:    ret
   %or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b)
@@ -252,11 +251,10 @@ define i64 @rori_i64(i64 %a) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    srli a2, a0, 1
 ; CHECK-NEXT:    slli a3, a1, 31
-; CHECK-NEXT:    or a2, a3, a2
 ; CHECK-NEXT:    srli a1, a1, 1
-; CHECK-NEXT:    slli a0, a0, 31
-; CHECK-NEXT:    or a1, a0, a1
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    slli a4, a0, 31
+; CHECK-NEXT:    or a0, a3, a2
+; CHECK-NEXT:    or a1, a4, a1
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63)
   ret i64 %1
@@ -267,11 +265,10 @@ define i64 @rori_i64_fshr(i64 %a) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    srli a2, a1, 31
 ; CHECK-NEXT:    slli a3, a0, 1
-; CHECK-NEXT:    or a2, a3, a2
-; CHECK-NEXT:    srli a0, a0, 31
+; CHECK-NEXT:    srli a4, a0, 31
 ; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    or a1, a1, a0
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    or a0, a3, a2
+; CHECK-NEXT:    or a1, a1, a4
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 63)
   ret i64 %1
@@ -299,12 +296,12 @@ define i64 @not_shl_one_i64(i64 %x) {
 ; CHECK-LABEL: not_shl_one_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a1, a0, -32
+; CHECK-NEXT:    li a2, 1
 ; CHECK-NEXT:    slti a1, a1, 0
+; CHECK-NEXT:    sll a0, a2, a0
 ; CHECK-NEXT:    neg a2, a1
-; CHECK-NEXT:    li a3, 1
-; CHECK-NEXT:    sll a0, a3, a0
-; CHECK-NEXT:    and a2, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a2, a2, a0
 ; CHECK-NEXT:    and a1, a1, a0
 ; CHECK-NEXT:    not a0, a2
 ; CHECK-NEXT:    not a1, a1
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index af2ea35cf26c1b..90a8eadb3f974d 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -12,31 +12,31 @@ define i32 @ctlz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    beqz a0, .LBB0_2
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -62,11 +62,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: ctlz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -257,17 +257,17 @@ define i32 @ctpop_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -367,39 +367,39 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    sub a0, a0, a2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a4, a0, a2
-; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    srli a4, a0, 4
-; RV32I-NEXT:    add a0, a0, a4
-; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi a4, a4, -241
-; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
+; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    and a3, a1, a2
+; RV32I-NEXT:    and a2, a0, a4
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a3, a1, a4
 ; RV32I-NEXT:    srli a1, a1, 2
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    srli a2, a1, 4
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a4
 ; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    ret
 ;
@@ -417,9 +417,9 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    seqz a0, a0
 ; RV32I-NEXT:    addi a2, a1, -1
 ; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    seqz a0, a0
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    ret
 ;
@@ -440,9 +440,9 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    addi a2, a1, -1
 ; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    snez a1, a1
 ; RV32I-NEXT:    ret
 ;
@@ -451,8 +451,8 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a0, a0
 ; RV32ZBB-NEXT:    sltiu a0, a0, 2
-; RV32ZBB-NEXT:    xori a0, a0, 1
 ; RV32ZBB-NEXT:    sltiu a1, a1, 2
+; RV32ZBB-NEXT:    xori a0, a0, 1
 ; RV32ZBB-NEXT:    xori a1, a1, 1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -476,8 +476,8 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a0, a0
 ; RV32ZBB-NEXT:    addi a0, a0, -1
-; RV32ZBB-NEXT:    seqz a0, a0
 ; RV32ZBB-NEXT:    addi a1, a1, -1
+; RV32ZBB-NEXT:    seqz a0, a0
 ; RV32ZBB-NEXT:    seqz a1, a1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -491,10 +491,10 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
 ; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    addi a2, a1, -1
 ; RV32I-NEXT:    xor a1, a1, a2
 ; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    xori a1, a1, 1
 ; RV32I-NEXT:    ret
 ;
@@ -503,8 +503,8 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a0, a0
 ; RV32ZBB-NEXT:    addi a0, a0, -1
-; RV32ZBB-NEXT:    snez a0, a0
 ; RV32ZBB-NEXT:    addi a1, a1, -1
+; RV32ZBB-NEXT:    snez a0, a0
 ; RV32ZBB-NEXT:    snez a1, a1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -519,39 +519,39 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a0, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a4, a1, a2
-; RV32I-NEXT:    srli a1, a1, 2
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    srli a4, a1, 4
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi a4, a4, -241
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a5, a1, 8
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    slli a5, a1, 16
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    srli a5, a0, 1
 ; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:    sub a0, a0, a3
-; RV32I-NEXT:    and a3, a0, a2
+; RV32I-NEXT:    and a2, a1, a4
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and a3, a0, a4
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    slli a2, a0, 8
-; RV32I-NEXT:    add a0, a0, a2
-; RV32I-NEXT:    slli a2, a0, 16
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    srli a3, a0, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    slli a2, a1, 8
+; RV32I-NEXT:    slli a3, a0, 8
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    slli a2, a1, 16
+; RV32I-NEXT:    slli a3, a0, 16
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    li a1, 0
@@ -682,77 +682,77 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
 define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a2, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    srli a5, a3, 1
-; RV32I-NEXT:    lui a6, 349525
-; RV32I-NEXT:    addi a6, a6, 1365
-; RV32I-NEXT:    and a5, a5, a6
-; RV32I-NEXT:    sub a3, a3, a5
-; RV32I-NEXT:    lui a5, 209715
-; RV32I-NEXT:    addi a5, a5, 819
-; RV32I-NEXT:    and a7, a3, a5
+; RV32I-NEXT:    lui a5, 349525
+; RV32I-NEXT:    addi a5, a5, 1365
+; RV32I-NEXT:    srli a6, a4, 1
+; RV32I-NEXT:    srli a7, a3, 1
+; RV32I-NEXT:    srli t0, a1, 1
+; RV32I-NEXT:    srli t1, a2, 1
+; RV32I-NEXT:    and a6, a6, a5
+; RV32I-NEXT:    and a7, a7, a5
+; RV32I-NEXT:    and t0, t0, a5
+; RV32I-NEXT:    and a5, t1, a5
+; RV32I-NEXT:    lui t1, 209715
+; RV32I-NEXT:    addi t1, t1, 819
+; RV32I-NEXT:    sub a4, a4, a6
+; RV32I-NEXT:    sub a3, a3, a7
+; RV32I-NEXT:    sub a1, a1, t0
+; RV32I-NEXT:    sub a2, a2, a5
+; RV32I-NEXT:    and a5, a4, t1
+; RV32I-NEXT:    srli a4, a4, 2
+; RV32I-NEXT:    and a6, a3, t1
 ; RV32I-NEXT:    srli a3, a3, 2
-; RV32I-NEXT:    and a3, a3, a5
-; RV32I-NEXT:    add a3, a7, a3
+; RV32I-NEXT:    and a7, a1, t1
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and t0, a2, t1
+; RV32I-NEXT:    srli a2, a2, 2
+; RV32I-NEXT:    and a4, a4, t1
+; RV32I-NEXT:    and a3, a3, t1
+; RV32I-NEXT:    and a1, a1, t1
+; RV32I-NEXT:    and a2, a2, t1
+; RV32I-NEXT:    add a4, a5, a4
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a5, a5, -241
+; RV32I-NEXT:    add a3, a6, a3
+; RV32I-NEXT:    add a1, a7, a1
+; RV32I-NEXT:    add a2, t0, a2
+; RV32I-NEXT:    srli a6, a4, 4
 ; RV32I-NEXT:    srli a7, a3, 4
+; RV32I-NEXT:    srli t0, a1, 4
+; RV32I-NEXT:    add a4, a4, a6
+; RV32I-NEXT:    srli a6, a2, 4
 ; RV32I-NEXT:    add a3, a3, a7
-; RV32I-NEXT:    lui a7, 61681
-; RV32I-NEXT:    addi a7, a7, -241
-; RV32I-NEXT:    and a3, a3, a7
-; RV32I-NEXT:    slli t0, a3, 8
-; RV32I-NEXT:    add a3, a3, t0
-; RV32I-NEXT:    slli t0, a3, 16
-; RV32I-NEXT:    add a3, a3, t0
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    srli t0, a4, 1
-; RV32I-NEXT:    and t0, t0, a6
-; RV32I-NEXT:    sub a4, a4, t0
-; RV32I-NEXT:    and t0, a4, a5
-; RV32I-NEXT:    srli a4, a4, 2
+; RV32I-NEXT:    add a1, a1, t0
+; RV32I-NEXT:    add a2, a2, a6
 ; RV32I-NEXT:    and a4, a4, a5
-; RV32I-NEXT:    add a4, t0, a4
-; RV32I-NEXT:    srli t0, a4, 4
-; RV32I-NEXT:    add a4, a4, t0
-; RV32I-NEXT:    and a4, a4, a7
-; RV32I-NEXT:    slli t0, a4, 8
-; RV32I-NEXT:    add a4, a4, t0
-; RV32I-NEXT:    slli t0, a4, 16
-; RV32I-NEXT:    add a4, a4, t0
-; RV32I-NEXT:    srli a4, a4, 24
-; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    srli a4, a1, 1
-; RV32I-NEXT:    and a4, a4, a6
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    and a4, a1, a5
-; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and a3, a3, a5
 ; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    srli a4, a1, 4
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    and a1, a1, a7
-; RV32I-NEXT:    slli a4, a1, 8
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    slli a4, a1, 16
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    srli a4, a2, 1
-; RV32I-NEXT:    and a4, a4, a6
-; RV32I-NEXT:    sub a2, a2, a4
-; RV32I-NEXT:    and a4, a2, a5
-; RV32I-NEXT:    srli a2, a2, 2
 ; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    add a2, a4, a2
-; RV32I-NEXT:    srli a4, a2, 4
-; RV32I-NEXT:    add a2, a2, a4
-; RV32I-NEXT:    and a2, a2, a7
-; RV32I-NEXT:    slli a4, a2, 8
-; RV32I-NEXT:    add a2, a2, a4
-; RV32I-NEXT:    slli a4, a2, 16
-; RV32I-NEXT:    add a2, a2, a4
+; RV32I-NEXT:    slli a5, a4, 8
+; RV32I-NEXT:    slli a6, a3, 8
+; RV32I-NEXT:    slli a7, a1, 8
+; RV32I-NEXT:    slli t0, a2, 8
+; RV32I-NEXT:    add a4, a4, a5
+; RV32I-NEXT:    add a3, a3, a6
+; RV32I-NEXT:    add a1, a1, a7
+; RV32I-NEXT:    add a2, a2, t0
+; RV32I-NEXT:    slli a5, a4, 16
+; RV32I-NEXT:    slli a6, a3, 16
+; RV32I-NEXT:    slli a7, a1, 16
+; RV32I-NEXT:    slli t0, a2, 16
+; RV32I-NEXT:    add a4, a4, a5
+; RV32I-NEXT:    add a3, a3, a6
+; RV32I-NEXT:    add a1, a1, a7
+; RV32I-NEXT:    add a2, a2, t0
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    srli a2, a2, 24
+; RV32I-NEXT:    add a3, a3, a4
 ; RV32I-NEXT:    add a1, a2, a1
 ; RV32I-NEXT:    sw a3, 0(a0)
 ; RV32I-NEXT:    sw zero, 4(a0)
@@ -764,14 +764,14 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 4(a1)
 ; RV32ZBB-NEXT:    lw a3, 0(a1)
-; RV32ZBB-NEXT:    lw a4, 8(a1)
-; RV32ZBB-NEXT:    lw a1, 12(a1)
+; RV32ZBB-NEXT:    lw a4, 12(a1)
+; RV32ZBB-NEXT:    lw a1, 8(a1)
 ; RV32ZBB-NEXT:    cpop a2, a2
 ; RV32ZBB-NEXT:    cpop a3, a3
-; RV32ZBB-NEXT:    add a2, a3, a2
+; RV32ZBB-NEXT:    cpop a4, a4
 ; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    cpop a3, a4
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add a2, a3, a2
+; RV32ZBB-NEXT:    add a1, a1, a4
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw zero, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
@@ -787,35 +787,35 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    lw a1, 0(a0)
 ; RV32I-NEXT:    lw a2, 4(a0)
 ; RV32I-NEXT:    lw a3, 8(a0)
-; RV32I-NEXT:    lw a4, 12(a0)
-; RV32I-NEXT:    addi a0, a1, -1
-; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    addi a4, a1, -1
+; RV32I-NEXT:    and a4, a1, a4
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    and a1, a2, a1
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    seqz a0, a0
-; RV32I-NEXT:    addi a1, a3, -1
-; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    seqz a2, a3
-; RV32I-NEXT:    sub a2, a4, a2
-; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    seqz a1, a1
+; RV32I-NEXT:    addi a2, a3, -1
+; RV32I-NEXT:    and a2, a3, a2
+; RV32I-NEXT:    seqz a3, a3
+; RV32I-NEXT:    sub a3, a0, a3
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    or a2, a2, a0
+; RV32I-NEXT:    seqz a0, a1
+; RV32I-NEXT:    seqz a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64_ult_two:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 0(a0)
-; RV32ZBB-NEXT:    lw a0, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 4(a0)
+; RV32ZBB-NEXT:    lw a0, 0(a0)
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    cpop a3, a3
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    cpop a2, a3
-; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    add a0, a0, a3
 ; RV32ZBB-NEXT:    sltiu a0, a0, 2
 ; RV32ZBB-NEXT:    sltiu a1, a1, 2
 ; RV32ZBB-NEXT:    ret
@@ -830,38 +830,38 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    lw a1, 0(a0)
 ; RV32I-NEXT:    lw a2, 4(a0)
 ; RV32I-NEXT:    lw a3, 8(a0)
-; RV32I-NEXT:    lw a4, 12(a0)
-; RV32I-NEXT:    addi a0, a1, -1
-; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    addi a4, a1, -1
+; RV32I-NEXT:    and a4, a1, a4
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    and a1, a2, a1
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    snez a0, a0
-; RV32I-NEXT:    addi a1, a3, -1
-; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    seqz a2, a3
-; RV32I-NEXT:    sub a2, a4, a2
-; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    addi a2, a3, -1
+; RV32I-NEXT:    and a2, a3, a2
+; RV32I-NEXT:    seqz a3, a3
+; RV32I-NEXT:    sub a3, a0, a3
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    or a2, a2, a0
+; RV32I-NEXT:    snez a0, a1
+; RV32I-NEXT:    snez a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64_ugt_one:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 0(a0)
-; RV32ZBB-NEXT:    lw a0, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 4(a0)
+; RV32ZBB-NEXT:    lw a0, 0(a0)
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    cpop a3, a3
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    cpop a2, a3
-; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    add a0, a0, a3
 ; RV32ZBB-NEXT:    sltiu a0, a0, 2
-; RV32ZBB-NEXT:    xori a0, a0, 1
 ; RV32ZBB-NEXT:    sltiu a1, a1, 2
+; RV32ZBB-NEXT:    xori a0, a0, 1
 ; RV32ZBB-NEXT:    xori a1, a1, 1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -906,17 +906,17 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 0(a0)
-; RV32ZBB-NEXT:    lw a0, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 4(a0)
+; RV32ZBB-NEXT:    lw a0, 0(a0)
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    cpop a3, a3
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    cpop a2, a3
-; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    add a0, a0, a3
 ; RV32ZBB-NEXT:    addi a0, a0, -1
-; RV32ZBB-NEXT:    seqz a0, a0
 ; RV32ZBB-NEXT:    addi a1, a1, -1
+; RV32ZBB-NEXT:    seqz a0, a0
 ; RV32ZBB-NEXT:    seqz a1, a1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -963,17 +963,17 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 0(a0)
-; RV32ZBB-NEXT:    lw a0, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 4(a0)
+; RV32ZBB-NEXT:    lw a0, 0(a0)
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    cpop a3, a3
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    cpop a2, a3
-; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    add a0, a0, a3
 ; RV32ZBB-NEXT:    addi a0, a0, -1
-; RV32ZBB-NEXT:    snez a0, a0
 ; RV32ZBB-NEXT:    addi a1, a1, -1
+; RV32ZBB-NEXT:    snez a0, a0
 ; RV32ZBB-NEXT:    snez a1, a1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -1300,11 +1300,11 @@ define i32 @bswap_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
@@ -1326,25 +1326,24 @@ define i64 @bswap_i64(i64 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 8
 ; RV32I-NEXT:    lui a3, 16
+; RV32I-NEXT:    srli a4, a1, 24
+; RV32I-NEXT:    srli a5, a0, 8
 ; RV32I-NEXT:    addi a3, a3, -256
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    or a2, a1, a2
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a5, a1, 24
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a0, a3
+; RV32I-NEXT:    or a0, a1, a2
+; RV32I-NEXT:    or a1, a3, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: bswap_i64:
@@ -1405,12 +1404,12 @@ define i64 @orc_b_i64(i64 %a) {
 ; CHECK-NEXT:    and a1, a1, a2
 ; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    slli a2, a0, 8
-; CHECK-NEXT:    sltu a3, a2, a0
-; CHECK-NEXT:    srli a4, a0, 24
-; CHECK-NEXT:    slli a5, a1, 8
-; CHECK-NEXT:    or a4, a5, a4
-; CHECK-NEXT:    sub a1, a4, a1
-; CHECK-NEXT:    sub a1, a1, a3
+; CHECK-NEXT:    srli a3, a0, 24
+; CHECK-NEXT:    slli a4, a1, 8
+; CHECK-NEXT:    sltu a5, a2, a0
+; CHECK-NEXT:    or a3, a4, a3
+; CHECK-NEXT:    sub a1, a3, a1
+; CHECK-NEXT:    sub a1, a1, a5
 ; CHECK-NEXT:    sub a0, a2, a0
 ; CHECK-NEXT:    ret
   %1 = and i64 %a, 72340172838076673
diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index c0b9e0b3c7748e..1a3beeb79b85bd 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -49,14 +49,14 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: bclr_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a3, a2, 63
-; RV32I-NEXT:    addi a4, a3, -32
-; RV32I-NEXT:    slti a4, a4, 0
+; RV32I-NEXT:    li a4, 1
+; RV32I-NEXT:    addi a5, a3, -32
+; RV32I-NEXT:    sll a2, a4, a2
+; RV32I-NEXT:    sll a3, a4, a3
+; RV32I-NEXT:    slti a4, a5, 0
 ; RV32I-NEXT:    neg a5, a4
-; RV32I-NEXT:    li a6, 1
-; RV32I-NEXT:    sll a2, a6, a2
-; RV32I-NEXT:    and a2, a5, a2
-; RV32I-NEXT:    sll a3, a6, a3
 ; RV32I-NEXT:    addi a4, a4, -1
+; RV32I-NEXT:    and a2, a5, a2
 ; RV32I-NEXT:    and a3, a4, a3
 ; RV32I-NEXT:    not a2, a2
 ; RV32I-NEXT:    not a3, a3
@@ -67,13 +67,13 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBSNOZBB-LABEL: bclr_i64:
 ; RV32ZBSNOZBB:       # %bb.0:
 ; RV32ZBSNOZBB-NEXT:    andi a3, a2, 63
+; RV32ZBSNOZBB-NEXT:    bset a2, zero, a2
 ; RV32ZBSNOZBB-NEXT:    addi a4, a3, -32
+; RV32ZBSNOZBB-NEXT:    bset a3, zero, a3
 ; RV32ZBSNOZBB-NEXT:    slti a4, a4, 0
 ; RV32ZBSNOZBB-NEXT:    neg a5, a4
-; RV32ZBSNOZBB-NEXT:    bset a2, zero, a2
-; RV32ZBSNOZBB-NEXT:    and a2, a5, a2
-; RV32ZBSNOZBB-NEXT:    bset a3, zero, a3
 ; RV32ZBSNOZBB-NEXT:    addi a4, a4, -1
+; RV32ZBSNOZBB-NEXT:    and a2, a5, a2
 ; RV32ZBSNOZBB-NEXT:    and a3, a4, a3
 ; RV32ZBSNOZBB-NEXT:    not a3, a3
 ; RV32ZBSNOZBB-NEXT:    not a2, a2
@@ -84,13 +84,13 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBSZBB-LABEL: bclr_i64:
 ; RV32ZBSZBB:       # %bb.0:
 ; RV32ZBSZBB-NEXT:    andi a3, a2, 63
+; RV32ZBSZBB-NEXT:    bset a2, zero, a2
 ; RV32ZBSZBB-NEXT:    bset a4, zero, a3
 ; RV32ZBSZBB-NEXT:    addi a3, a3, -32
 ; RV32ZBSZBB-NEXT:    slti a3, a3, 0
 ; RV32ZBSZBB-NEXT:    addi a5, a3, -1
-; RV32ZBSZBB-NEXT:    and a4, a5, a4
 ; RV32ZBSZBB-NEXT:    neg a3, a3
-; RV32ZBSZBB-NEXT:    bset a2, zero, a2
+; RV32ZBSZBB-NEXT:    and a4, a5, a4
 ; RV32ZBSZBB-NEXT:    and a2, a3, a2
 ; RV32ZBSZBB-NEXT:    andn a0, a0, a2
 ; RV32ZBSZBB-NEXT:    andn a1, a1, a4
@@ -187,24 +187,24 @@ define signext i64 @bset_i64_zero(i64 signext %a) nounwind {
 ; RV32I-LABEL: bset_i64_zero:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a1, a0, -32
+; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    slti a1, a1, 0
-; RV32I-NEXT:    neg a2, a1
-; RV32I-NEXT:    li a3, 1
-; RV32I-NEXT:    sll a3, a3, a0
-; RV32I-NEXT:    and a0, a2, a3
+; RV32I-NEXT:    sll a2, a2, a0
+; RV32I-NEXT:    neg a0, a1
 ; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBS-LABEL: bset_i64_zero:
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    addi a1, a0, -32
-; RV32ZBS-NEXT:    slti a1, a1, 0
-; RV32ZBS-NEXT:    neg a2, a1
-; RV32ZBS-NEXT:    bset a3, zero, a0
-; RV32ZBS-NEXT:    and a0, a2, a3
-; RV32ZBS-NEXT:    addi a1, a1, -1
-; RV32ZBS-NEXT:    and a1, a1, a3
+; RV32ZBS-NEXT:    bset a2, zero, a0
+; RV32ZBS-NEXT:    slti a0, a1, 0
+; RV32ZBS-NEXT:    neg a1, a0
+; RV32ZBS-NEXT:    addi a3, a0, -1
+; RV32ZBS-NEXT:    and a0, a1, a2
+; RV32ZBS-NEXT:    and a1, a3, a2
 ; RV32ZBS-NEXT:    ret
   %shl = shl i64 1, %a
   ret i64 %shl
diff --git a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
index 315bf86046dff5..dd49d9e3e2dce1 100644
--- a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
@@ -73,13 +73,13 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64I-NEXT:    li a1, -449
 ; RV64I-NEXT:    slli a1, a1, 53
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __fixdfti
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    mv s3, a1
 ; RV64I-NEXT:    li s5, -1
-; RV64I-NEXT:    bgez s1, .LBB4_2
+; RV64I-NEXT:    bgez s2, .LBB4_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    slli s3, s5, 63
 ; RV64I-NEXT:  .LBB4_2:
@@ -97,14 +97,14 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64I-NEXT:    mv a1, s0
 ; RV64I-NEXT:    call __unorddf2
 ; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    slti a1, s2, 0
+; RV64I-NEXT:    sgtz a2, s4
 ; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    addi a3, a1, -1
 ; RV64I-NEXT:    and a1, a0, s3
-; RV64I-NEXT:    slti a2, s1, 0
-; RV64I-NEXT:    addi a2, a2, -1
-; RV64I-NEXT:    and a2, a2, s2
-; RV64I-NEXT:    sgtz a3, s4
-; RV64I-NEXT:    neg a3, a3
-; RV64I-NEXT:    or a2, a3, a2
+; RV64I-NEXT:    and a3, a3, s1
+; RV64I-NEXT:    neg a2, a2
+; RV64I-NEXT:    or a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -140,11 +140,11 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64ID-NEXT:    srli a1, a2, 1
 ; RV64ID-NEXT:  .LBB4_4:
 ; RV64ID-NEXT:    feq.d a2, fs0, fs0
-; RV64ID-NEXT:    neg a2, a2
-; RV64ID-NEXT:    and a1, a2, a1
 ; RV64ID-NEXT:    neg a3, a3
 ; RV64ID-NEXT:    neg a4, s0
+; RV64ID-NEXT:    neg a2, a2
 ; RV64ID-NEXT:    and a0, a4, a0
+; RV64ID-NEXT:    and a1, a2, a1
 ; RV64ID-NEXT:    or a0, a3, a0
 ; RV64ID-NEXT:    and a0, a2, a0
 ; RV64ID-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -178,11 +178,11 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64IDINX-NEXT:    srli a1, a2, 1
 ; RV64IDINX-NEXT:  .LBB4_4:
 ; RV64IDINX-NEXT:    feq.d a2, s0, s0
-; RV64IDINX-NEXT:    neg a2, a2
-; RV64IDINX-NEXT:    and a1, a2, a1
 ; RV64IDINX-NEXT:    neg a3, a3
 ; RV64IDINX-NEXT:    neg a4, s1
+; RV64IDINX-NEXT:    neg a2, a2
 ; RV64IDINX-NEXT:    and a0, a4, a0
+; RV64IDINX-NEXT:    and a1, a2, a1
 ; RV64IDINX-NEXT:    or a0, a3, a0
 ; RV64IDINX-NEXT:    and a0, a2, a0
 ; RV64IDINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -219,10 +219,10 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind {
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __gtdf2
 ; RV64I-NEXT:    sgtz a0, a0
-; RV64I-NEXT:    neg a1, a0
-; RV64I-NEXT:    or a0, a1, s3
-; RV64I-NEXT:    and a2, s2, s1
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    and a1, s2, s1
+; RV64I-NEXT:    neg a2, a0
+; RV64I-NEXT:    or a0, a2, s3
+; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -245,10 +245,10 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind {
 ; RV64ID-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV64ID-NEXT:    fld fa5, %lo(.LCPI5_0)(a2)
 ; RV64ID-NEXT:    and a0, s0, a0
+; RV64ID-NEXT:    and a1, s0, a1
 ; RV64ID-NEXT:    flt.d a2, fa5, fs0
 ; RV64ID-NEXT:    neg a2, a2
 ; RV64ID-NEXT:    or a0, a2, a0
-; RV64ID-NEXT:    and a1, s0, a1
 ; RV64ID-NEXT:    or a1, a2, a1
 ; RV64ID-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -270,10 +270,10 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind {
 ; RV64IDINX-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV64IDINX-NEXT:    ld a2, %lo(.LCPI5_0)(a2)
 ; RV64IDINX-NEXT:    and a0, s1, a0
+; RV64IDINX-NEXT:    and a1, s1, a1
 ; RV64IDINX-NEXT:    flt.d a2, a2, s0
 ; RV64IDINX-NEXT:    neg a2, a2
 ; RV64IDINX-NEXT:    or a0, a2, a0
-; RV64IDINX-NEXT:    and a1, s1, a1
 ; RV64IDINX-NEXT:    or a1, a2, a1
 ; RV64IDINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IDINX-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll
index 8ebb9433bad79a..0cdd92fbaf916b 100644
--- a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll
@@ -133,14 +133,14 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind {
 ; RV64I-NEXT:    mv a1, s0
 ; RV64I-NEXT:    call __unordsf2
 ; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    slti a1, s1, 0
+; RV64I-NEXT:    sgtz a2, s4
 ; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    addi a3, a1, -1
 ; RV64I-NEXT:    and a1, a0, s3
-; RV64I-NEXT:    slti a2, s1, 0
-; RV64I-NEXT:    addi a2, a2, -1
-; RV64I-NEXT:    and a2, a2, s2
-; RV64I-NEXT:    sgtz a3, s4
-; RV64I-NEXT:    neg a3, a3
-; RV64I-NEXT:    or a2, a3, a2
+; RV64I-NEXT:    and a3, a3, s2
+; RV64I-NEXT:    neg a2, a2
+; RV64I-NEXT:    or a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -176,11 +176,11 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind {
 ; RV64IF-NEXT:    srli a1, a3, 1
 ; RV64IF-NEXT:  .LBB4_4:
 ; RV64IF-NEXT:    feq.s a3, fs0, fs0
-; RV64IF-NEXT:    neg a3, a3
-; RV64IF-NEXT:    and a1, a3, a1
 ; RV64IF-NEXT:    neg a4, s0
-; RV64IF-NEXT:    and a0, a4, a0
 ; RV64IF-NEXT:    neg a2, a2
+; RV64IF-NEXT:    neg a3, a3
+; RV64IF-NEXT:    and a0, a4, a0
+; RV64IF-NEXT:    and a1, a3, a1
 ; RV64IF-NEXT:    or a0, a2, a0
 ; RV64IF-NEXT:    and a0, a3, a0
 ; RV64IF-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -213,11 +213,11 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind {
 ; RV64IZFINX-NEXT:    srli a1, a2, 1
 ; RV64IZFINX-NEXT:  .LBB4_4:
 ; RV64IZFINX-NEXT:    feq.s a2, s0, s0
-; RV64IZFINX-NEXT:    neg a2, a2
-; RV64IZFINX-NEXT:    and a1, a2, a1
 ; RV64IZFINX-NEXT:    neg a4, s1
-; RV64IZFINX-NEXT:    and a0, a4, a0
 ; RV64IZFINX-NEXT:    neg a3, a3
+; RV64IZFINX-NEXT:    neg a2, a2
+; RV64IZFINX-NEXT:    and a0, a4, a0
+; RV64IZFINX-NEXT:    and a1, a2, a1
 ; RV64IZFINX-NEXT:    or a0, a3, a0
 ; RV64IZFINX-NEXT:    and a0, a2, a0
 ; RV64IZFINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -252,8 +252,8 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind {
 ; RV64I-NEXT:    sext.w a0, s0
 ; RV64I-NEXT:    call __fixunssfti
 ; RV64I-NEXT:    and a0, s2, a0
-; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    and a1, s2, a1
+; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    or a1, s1, a1
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -276,10 +276,10 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind {
 ; RV64IF-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV64IF-NEXT:    flw fa5, %lo(.LCPI5_0)(a2)
 ; RV64IF-NEXT:    and a0, s0, a0
+; RV64IF-NEXT:    and a1, s0, a1
 ; RV64IF-NEXT:    flt.s a2, fa5, fs0
 ; RV64IF-NEXT:    neg a2, a2
 ; RV64IF-NEXT:    or a0, a2, a0
-; RV64IF-NEXT:    and a1, s0, a1
 ; RV64IF-NEXT:    or a1, a2, a1
 ; RV64IF-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IF-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -300,11 +300,11 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind {
 ; RV64IZFINX-NEXT:    call __fixunssfti
 ; RV64IZFINX-NEXT:    and a0, s1, a0
 ; RV64IZFINX-NEXT:    lui a2, 522240
+; RV64IZFINX-NEXT:    and a1, s1, a1
 ; RV64IZFINX-NEXT:    addiw a2, a2, -1
 ; RV64IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV64IZFINX-NEXT:    neg a2, a2
 ; RV64IZFINX-NEXT:    or a0, a2, a0
-; RV64IZFINX-NEXT:    and a1, s1, a1
 ; RV64IZFINX-NEXT:    or a1, a2, a1
 ; RV64IZFINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IZFINX-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
index 2cb2ecbd57f65c..a717c6c71f2ec0 100644
--- a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
@@ -174,14 +174,14 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind {
 ; RV64I-NEXT:    mv a1, s1
 ; RV64I-NEXT:    call __unordsf2
 ; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    sgtz a1, s4
+; RV64I-NEXT:    slti a2, s0, 0
 ; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    neg a3, a1
+; RV64I-NEXT:    addi a2, a2, -1
 ; RV64I-NEXT:    and a1, a0, s3
-; RV64I-NEXT:    sgtz a2, s4
-; RV64I-NEXT:    neg a2, a2
-; RV64I-NEXT:    slti a3, s0, 0
-; RV64I-NEXT:    addi a3, a3, -1
-; RV64I-NEXT:    and a3, a3, s2
-; RV64I-NEXT:    or a2, a2, a3
+; RV64I-NEXT:    and a2, a2, s2
+; RV64I-NEXT:    or a2, a3, a2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -218,11 +218,11 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind {
 ; RV64IZFH-NEXT:    srli a1, a2, 1
 ; RV64IZFH-NEXT:  .LBB4_4:
 ; RV64IZFH-NEXT:    feq.s a2, fs0, fs0
-; RV64IZFH-NEXT:    neg a2, a2
-; RV64IZFH-NEXT:    and a1, a2, a1
 ; RV64IZFH-NEXT:    neg a3, a3
 ; RV64IZFH-NEXT:    neg a4, s0
+; RV64IZFH-NEXT:    neg a2, a2
 ; RV64IZFH-NEXT:    and a0, a4, a0
+; RV64IZFH-NEXT:    and a1, a2, a1
 ; RV64IZFH-NEXT:    or a0, a3, a0
 ; RV64IZFH-NEXT:    and a0, a2, a0
 ; RV64IZFH-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -255,11 +255,11 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind {
 ; RV64IZHINX-NEXT:    srli a1, a2, 1
 ; RV64IZHINX-NEXT:  .LBB4_4:
 ; RV64IZHINX-NEXT:    feq.s a2, s0, s0
-; RV64IZHINX-NEXT:    neg a2, a2
-; RV64IZHINX-NEXT:    and a1, a2, a1
 ; RV64IZHINX-NEXT:    neg a3, a3
 ; RV64IZHINX-NEXT:    neg a4, s1
+; RV64IZHINX-NEXT:    neg a2, a2
 ; RV64IZHINX-NEXT:    and a0, a4, a0
+; RV64IZHINX-NEXT:    and a1, a2, a1
 ; RV64IZHINX-NEXT:    or a0, a3, a0
 ; RV64IZHINX-NEXT:    and a0, a2, a0
 ; RV64IZHINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -295,8 +295,8 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind {
 ; RV64I-NEXT:    sext.w a0, s0
 ; RV64I-NEXT:    call __fixunssfti
 ; RV64I-NEXT:    and a0, s2, a0
-; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    and a1, s2, a1
+; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    or a1, s1, a1
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -314,15 +314,15 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind {
 ; RV64IZFH-NEXT:    lui a0, %hi(.LCPI5_0)
 ; RV64IZFH-NEXT:    flw fa5, %lo(.LCPI5_0)(a0)
 ; RV64IZFH-NEXT:    fcvt.s.h fa0, fa0
-; RV64IZFH-NEXT:    flt.s a0, fa5, fa0
-; RV64IZFH-NEXT:    neg s0, a0
-; RV64IZFH-NEXT:    fmv.w.x fa5, zero
-; RV64IZFH-NEXT:    fle.s a0, fa5, fa0
+; RV64IZFH-NEXT:    fmv.w.x fa4, zero
+; RV64IZFH-NEXT:    fle.s a0, fa4, fa0
+; RV64IZFH-NEXT:    flt.s a1, fa5, fa0
+; RV64IZFH-NEXT:    neg s0, a1
 ; RV64IZFH-NEXT:    neg s1, a0
 ; RV64IZFH-NEXT:    call __fixunssfti
 ; RV64IZFH-NEXT:    and a0, s1, a0
-; RV64IZFH-NEXT:    or a0, s0, a0
 ; RV64IZFH-NEXT:    and a1, s1, a1
+; RV64IZFH-NEXT:    or a0, s0, a0
 ; RV64IZFH-NEXT:    or a1, s0, a1
 ; RV64IZFH-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IZFH-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -339,14 +339,14 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind {
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
 ; RV64IZHINX-NEXT:    lui a1, 522240
 ; RV64IZHINX-NEXT:    addiw a1, a1, -1
+; RV64IZHINX-NEXT:    fle.s a2, zero, a0
 ; RV64IZHINX-NEXT:    flt.s a1, a1, a0
 ; RV64IZHINX-NEXT:    neg s0, a1
-; RV64IZHINX-NEXT:    fle.s a1, zero, a0
-; RV64IZHINX-NEXT:    neg s1, a1
+; RV64IZHINX-NEXT:    neg s1, a2
 ; RV64IZHINX-NEXT:    call __fixunssfti
 ; RV64IZHINX-NEXT:    and a0, s1, a0
-; RV64IZHINX-NEXT:    or a0, s0, a0
 ; RV64IZHINX-NEXT:    and a1, s1, a1
+; RV64IZHINX-NEXT:    or a0, s0, a0
 ; RV64IZHINX-NEXT:    or a1, s0, a1
 ; RV64IZHINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IZHINX-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
index ba184063265098..1ec4d8ddd1d84e 100644
--- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
@@ -18,15 +18,15 @@ define i64 @test0(i64 %n, ptr %p) nounwind {
 ; RV64-NEXT:    mv s0, a0
 ; RV64-NEXT:    lui a0, %hi(f)
 ; RV64-NEXT:    addi a0, a0, %lo(f)
+; RV64-NEXT:    li a2, 919
+; RV64-NEXT:    lui a3, %hi(.LCPI0_0)
 ; RV64-NEXT:    sd a0, 32(sp)
-; RV64-NEXT:    li a0, 919
-; RV64-NEXT:    lui a2, %hi(.LCPI0_0)
-; RV64-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
-; RV64-NEXT:    lui a3, 6203
-; RV64-NEXT:    addi a3, a3, 643
-; RV64-NEXT:    sw a0, 8(sp)
-; RV64-NEXT:    sw a3, 12(sp)
-; RV64-NEXT:    sd a2, 16(sp)
+; RV64-NEXT:    lui a0, 6203
+; RV64-NEXT:    ld a3, %lo(.LCPI0_0)(a3)
+; RV64-NEXT:    addi a0, a0, 643
+; RV64-NEXT:    sw a2, 8(sp)
+; RV64-NEXT:    sw a0, 12(sp)
+; RV64-NEXT:    sd a3, 16(sp)
 ; RV64-NEXT:    sd a1, 24(sp)
 ; RV64-NEXT:    addi a1, sp, 24
 ; RV64-NEXT:    addi a0, sp, 8
@@ -49,15 +49,15 @@ define i64 @test0(i64 %n, ptr %p) nounwind {
 ; RV64-LINUX-NEXT:    mv s0, a0
 ; RV64-LINUX-NEXT:    lui a0, %hi(f)
 ; RV64-LINUX-NEXT:    addi a0, a0, %lo(f)
+; RV64-LINUX-NEXT:    li a2, 919
+; RV64-LINUX-NEXT:    lui a3, %hi(.LCPI0_0)
 ; RV64-LINUX-NEXT:    sd a0, 32(sp)
-; RV64-LINUX-NEXT:    li a0, 919
-; RV64-LINUX-NEXT:    lui a2, %hi(.LCPI0_0)
-; RV64-LINUX-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
-; RV64-LINUX-NEXT:    lui a3, 6203
-; RV64-LINUX-NEXT:    addi a3, a3, 643
-; RV64-LINUX-NEXT:    sw a0, 8(sp)
-; RV64-LINUX-NEXT:    sw a3, 12(sp)
-; RV64-LINUX-NEXT:    sd a2, 16(sp)
+; RV64-LINUX-NEXT:    lui a0, 6203
+; RV64-LINUX-NEXT:    ld a3, %lo(.LCPI0_0)(a3)
+; RV64-LINUX-NEXT:    addi a0, a0, 643
+; RV64-LINUX-NEXT:    sw a2, 8(sp)
+; RV64-LINUX-NEXT:    sw a0, 12(sp)
+; RV64-LINUX-NEXT:    sd a3, 16(sp)
 ; RV64-LINUX-NEXT:    sd a1, 24(sp)
 ; RV64-LINUX-NEXT:    addi a1, sp, 24
 ; RV64-LINUX-NEXT:    addi a0, sp, 8
diff --git a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
index 6c4466796aeedd..b3c22a5322cb4c 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
@@ -175,10 +175,10 @@ define i8 @test13(ptr %0, i64 %1) {
 ; RV64I-LABEL: test13:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a2, 1
-; RV64I-NEXT:    subw a2, a2, a1
-; RV64I-NEXT:    add a2, a0, a2
 ; RV64I-NEXT:    li a3, 2
+; RV64I-NEXT:    subw a2, a2, a1
 ; RV64I-NEXT:    subw a3, a3, a1
+; RV64I-NEXT:    add a2, a0, a2
 ; RV64I-NEXT:    add a0, a0, a3
 ; RV64I-NEXT:    lbu a1, 0(a2)
 ; RV64I-NEXT:    lbu a0, 0(a0)
@@ -203,8 +203,8 @@ define signext i32 @test14(ptr %0, ptr %1, i64 %2) {
 ; RV64I-NEXT:    li a3, 1
 ; RV64I-NEXT:    subw a3, a3, a2
 ; RV64I-NEXT:    add a0, a0, a3
-; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    slli a3, a3, 2
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    lw a1, 0(a1)
 ; RV64I-NEXT:    addw a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
index ab1691543c78af..0782018833de30 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
@@ -7,11 +7,11 @@ define signext i32 @addw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
 ; CHECK-NEXT:    bge a0, a1, .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    not a2, a0
-; CHECK-NEXT:    add a2, a2, a1
 ; CHECK-NEXT:    addi a3, a0, 1
-; CHECK-NEXT:    mul a3, a2, a3
+; CHECK-NEXT:    add a2, a2, a1
 ; CHECK-NEXT:    subw a1, a1, a0
 ; CHECK-NEXT:    addi a1, a1, -2
+; CHECK-NEXT:    mul a3, a2, a3
 ; CHECK-NEXT:    slli a1, a1, 32
 ; CHECK-NEXT:    slli a2, a2, 32
 ; CHECK-NEXT:    mulhu a1, a2, a1
@@ -53,13 +53,13 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
 ; CHECK-NEXT:    bge a0, a1, .LBB1_2
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    not a2, a0
-; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    mul a2, a3, a2
-; CHECK-NEXT:    subw a1, a1, a0
-; CHECK-NEXT:    addi a1, a1, -2
-; CHECK-NEXT:    slli a1, a1, 32
+; CHECK-NEXT:    subw a3, a1, a0
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    addi a3, a3, -2
+; CHECK-NEXT:    mul a2, a1, a2
 ; CHECK-NEXT:    slli a3, a3, 32
-; CHECK-NEXT:    mulhu a1, a3, a1
+; CHECK-NEXT:    slli a1, a1, 32
+; CHECK-NEXT:    mulhu a1, a1, a3
 ; CHECK-NEXT:    srli a1, a1, 1
 ; CHECK-NEXT:    subw a0, a2, a0
 ; CHECK-NEXT:    subw a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 47c4e8beecced0..d9f7d361272934 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -12,31 +12,31 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB0_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -64,31 +64,31 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB1_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -125,31 +125,31 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a1, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a2, a1, 1
+; RV64I-NEXT:    lui a3, 349525
 ; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 16
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    addiw a2, a3, 1365
+; RV64I-NEXT:    srliw a3, a1, 2
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 4
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 8
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 16
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    srli a2, a1, 1
-; RV64I-NEXT:    lui a3, 349525
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a3, a1, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a1, a3
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    srli a2, a1, 4
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    addi a2, a3, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 8
 ; RV64I-NEXT:    add a1, a1, a2
@@ -179,39 +179,39 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: findLastSet_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    srliw a2, a1, 2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 16
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    srliw a3, a1, 2
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 4
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 8
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 16
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    srli a2, a1, 1
-; RV64I-NEXT:    lui a3, 349525
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a3, a1, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a1, a3
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    addi a3, a3, -241
+; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    srli a2, a1, 4
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a2, a1, 8
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 16
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    srliw a1, a1, 24
 ; RV64I-NEXT:    xori a1, a1, 31
-; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -219,10 +219,10 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64XTHEADBB-LABEL: findLastSet_i32:
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    not a1, a0
+; RV64XTHEADBB-NEXT:    snez a0, a0
 ; RV64XTHEADBB-NEXT:    slli a1, a1, 32
 ; RV64XTHEADBB-NEXT:    th.ff0 a1, a1
 ; RV64XTHEADBB-NEXT:    xori a1, a1, 31
-; RV64XTHEADBB-NEXT:    snez a0, a0
 ; RV64XTHEADBB-NEXT:    addi a0, a0, -1
 ; RV64XTHEADBB-NEXT:    or a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -240,31 +240,31 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    beqz a0, .LBB4_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -296,40 +296,40 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB5_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -456,10 +456,10 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64XTHEADBB-NEXT:    addi a1, a0, -1
 ; RV64XTHEADBB-NEXT:    not a2, a0
 ; RV64XTHEADBB-NEXT:    and a1, a2, a1
-; RV64XTHEADBB-NEXT:    th.ff1 a1, a1
 ; RV64XTHEADBB-NEXT:    li a2, 64
-; RV64XTHEADBB-NEXT:    sub a2, a2, a1
 ; RV64XTHEADBB-NEXT:    snez a0, a0
+; RV64XTHEADBB-NEXT:    th.ff1 a1, a1
+; RV64XTHEADBB-NEXT:    sub a2, a2, a1
 ; RV64XTHEADBB-NEXT:    addi a0, a0, -1
 ; RV64XTHEADBB-NEXT:    or a0, a0, a2
 ; RV64XTHEADBB-NEXT:    ret
@@ -486,8 +486,8 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, a1, %lo(.LCPI9_0)
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    addi a0, a0, 1
 ; RV64I-NEXT:    seqz a1, s0
+; RV64I-NEXT:    addi a0, a0, 1
 ; RV64I-NEXT:    addi a1, a1, -1
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -500,10 +500,10 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64XTHEADBB-NEXT:    addi a1, a0, -1
 ; RV64XTHEADBB-NEXT:    not a2, a0
 ; RV64XTHEADBB-NEXT:    and a1, a2, a1
-; RV64XTHEADBB-NEXT:    th.ff1 a1, a1
 ; RV64XTHEADBB-NEXT:    li a2, 65
-; RV64XTHEADBB-NEXT:    sub a2, a2, a1
 ; RV64XTHEADBB-NEXT:    seqz a0, a0
+; RV64XTHEADBB-NEXT:    th.ff1 a1, a1
+; RV64XTHEADBB-NEXT:    sub a2, a2, a1
 ; RV64XTHEADBB-NEXT:    addi a0, a0, -1
 ; RV64XTHEADBB-NEXT:    and a0, a0, a2
 ; RV64XTHEADBB-NEXT:    ret
@@ -802,11 +802,11 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srliw a3, a0, 24
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
@@ -827,11 +827,11 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 8
 ; RV64I-NEXT:    lui a3, 16
+; RV64I-NEXT:    srliw a4, a0, 24
 ; RV64I-NEXT:    addi a3, a3, -256
 ; RV64I-NEXT:    and a2, a2, a3
-; RV64I-NEXT:    srliw a4, a0, 24
-; RV64I-NEXT:    or a2, a2, a4
 ; RV64I-NEXT:    and a3, a0, a3
+; RV64I-NEXT:    or a2, a2, a4
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
@@ -856,28 +856,28 @@ define i64 @bswap_i64(i64 %a) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 40
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srli a3, a0, 56
+; RV64I-NEXT:    srli a4, a0, 24
+; RV64I-NEXT:    lui a5, 4080
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    lui a4, 4080
-; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a5, a0, 8
-; RV64I-NEXT:    srliw a5, a5, 24
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    and a4, a0, a4
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    and a4, a4, a5
+; RV64I-NEXT:    srliw a3, a3, 24
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    and a5, a0, a5
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a2, a2, 40
+; RV64I-NEXT:    or a1, a3, a1
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 07726b643b51ad..9760821832b375 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -2656,8 +2656,8 @@ define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
 ; RV64I-LABEL: array_index_lshr_sh3_sh3:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a1, 58
-; RV64I-NEXT:    slli a1, a1, 6
 ; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ld a0, 0(a0)
@@ -2759,8 +2759,8 @@ define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) {
 ; RV64I-LABEL: test_gep_gep_dont_crash:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a2, a2, 6
-; RV64I-NEXT:    slli a2, a2, 3
 ; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    slli a2, a2, 3
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
index a7af8ab348e99e..3f984deccfb2c7 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
@@ -59,12 +59,12 @@ define i64 @orcb64_knownbits(i64 %a) nounwind {
 ; RV64ZBB-LABEL: orcb64_knownbits:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    lui a1, 65535
+; RV64ZBB-NEXT:    lui a2, 256
 ; RV64ZBB-NEXT:    slli a1, a1, 12
+; RV64ZBB-NEXT:    addiw a2, a2, 8
 ; RV64ZBB-NEXT:    and a0, a0, a1
-; RV64ZBB-NEXT:    lui a1, 256
-; RV64ZBB-NEXT:    addiw a1, a1, 8
-; RV64ZBB-NEXT:    slli a2, a1, 42
-; RV64ZBB-NEXT:    add a1, a1, a2
+; RV64ZBB-NEXT:    slli a1, a2, 42
+; RV64ZBB-NEXT:    add a1, a2, a1
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    orc.b a0, a0
 ; RV64ZBB-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
index d9afb7c00ce58f..bf077364c9c7ab 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
@@ -146,10 +146,10 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind {
 ; RV64I-LABEL: rol_i32_neg_constant_rhs:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a1, -2
-; RV64I-NEXT:    sllw a2, a1, a0
-; RV64I-NEXT:    negw a0, a0
-; RV64I-NEXT:    srlw a0, a1, a0
-; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    negw a2, a0
+; RV64I-NEXT:    sllw a0, a1, a0
+; RV64I-NEXT:    srlw a1, a1, a2
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-ZBKB-LABEL: rol_i32_neg_constant_rhs:
@@ -224,10 +224,10 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind {
 ; RV64I-LABEL: ror_i32_neg_constant_rhs:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a1, -2
-; RV64I-NEXT:    srlw a2, a1, a0
-; RV64I-NEXT:    negw a0, a0
-; RV64I-NEXT:    sllw a0, a1, a0
-; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    negw a2, a0
+; RV64I-NEXT:    srlw a0, a1, a0
+; RV64I-NEXT:    sllw a1, a1, a2
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-ZBKB-LABEL: ror_i32_neg_constant_rhs:
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 1e7814d588e4c0..d67db77c04a8ea 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -12,31 +12,31 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB0_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -62,31 +62,31 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB1_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -121,31 +121,31 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a1, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a2, a1, 1
+; RV64I-NEXT:    lui a3, 349525
 ; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 16
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    addiw a2, a3, 1365
+; RV64I-NEXT:    srliw a3, a1, 2
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 4
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 8
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 16
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    srli a2, a1, 1
-; RV64I-NEXT:    lui a3, 349525
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a3, a1, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a1, a3
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    srli a2, a1, 4
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    addi a2, a3, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 8
 ; RV64I-NEXT:    add a1, a1, a2
@@ -173,39 +173,39 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: findLastSet_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    srliw a2, a1, 2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 16
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    srliw a3, a1, 2
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 4
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 8
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 16
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    srli a2, a1, 1
-; RV64I-NEXT:    lui a3, 349525
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a3, a1, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a1, a3
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    addi a3, a3, -241
+; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    srli a2, a1, 4
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a2, a1, 8
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 16
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    srliw a1, a1, 24
 ; RV64I-NEXT:    xori a1, a1, 31
-; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -213,8 +213,8 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64ZBB-LABEL: findLastSet_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    clzw a1, a0
-; RV64ZBB-NEXT:    xori a1, a1, 31
 ; RV64ZBB-NEXT:    snez a0, a0
+; RV64ZBB-NEXT:    xori a1, a1, 31
 ; RV64ZBB-NEXT:    addi a0, a0, -1
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    ret
@@ -232,31 +232,31 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    beqz a0, .LBB4_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -286,40 +286,40 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB5_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -456,8 +456,8 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, a1, %lo(.LCPI9_0)
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    addi a0, a0, 1
 ; RV64I-NEXT:    seqz a1, s0
+; RV64I-NEXT:    addi a0, a0, 1
 ; RV64I-NEXT:    addi a1, a1, -1
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -468,8 +468,8 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64ZBB-LABEL: ffs_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    ctzw a1, a0
-; RV64ZBB-NEXT:    addi a1, a1, 1
 ; RV64ZBB-NEXT:    seqz a0, a0
+; RV64ZBB-NEXT:    addi a1, a1, 1
 ; RV64ZBB-NEXT:    addi a0, a0, -1
 ; RV64ZBB-NEXT:    and a0, a0, a1
 ; RV64ZBB-NEXT:    ret
@@ -523,17 +523,17 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -630,21 +630,21 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
 ; RV64I-LABEL: ctpop_i32_load:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lw a0, 0(a0)
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -670,39 +670,39 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 1
 ; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    lui a4, 209715
+; RV64I-NEXT:    srli a5, a1, 1
 ; RV64I-NEXT:    addiw a3, a3, 1365
 ; RV64I-NEXT:    and a2, a2, a3
-; RV64I-NEXT:    sub a0, a0, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a4, a0, a2
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    srli a4, a0, 4
-; RV64I-NEXT:    add a0, a0, a4
-; RV64I-NEXT:    lui a4, 61681
-; RV64I-NEXT:    addi a4, a4, -241
-; RV64I-NEXT:    and a0, a0, a4
-; RV64I-NEXT:    slli a5, a0, 8
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    slli a5, a0, 16
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    srliw a0, a0, 24
-; RV64I-NEXT:    srli a5, a1, 1
 ; RV64I-NEXT:    and a3, a5, a3
+; RV64I-NEXT:    lui a5, 61681
+; RV64I-NEXT:    addiw a4, a4, 819
+; RV64I-NEXT:    addi a5, a5, -241
+; RV64I-NEXT:    sub a0, a0, a2
 ; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a0, a4
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a3, a1, a4
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
-; RV64I-NEXT:    srli a2, a1, 4
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    and a0, a0, a4
 ; RV64I-NEXT:    and a1, a1, a4
-; RV64I-NEXT:    slli a2, a1, 8
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    slli a2, a1, 16
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    srli a3, a1, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    and a0, a0, a5
+; RV64I-NEXT:    and a1, a1, a5
+; RV64I-NEXT:    slli a2, a0, 8
+; RV64I-NEXT:    slli a3, a1, 8
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    slli a2, a0, 16
+; RV64I-NEXT:    slli a3, a1, 16
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    srliw a0, a0, 24
 ; RV64I-NEXT:    srliw a1, a1, 24
 ; RV64I-NEXT:    ret
 ;
@@ -720,11 +720,11 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    ret
 ;
@@ -745,11 +745,11 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    snez a1, a1
 ; RV64I-NEXT:    ret
 ;
@@ -758,8 +758,8 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
 ; RV64ZBB-NEXT:    cpopw a1, a1
 ; RV64ZBB-NEXT:    cpopw a0, a0
 ; RV64ZBB-NEXT:    sltiu a0, a0, 2
-; RV64ZBB-NEXT:    xori a0, a0, 1
 ; RV64ZBB-NEXT:    sltiu a1, a1, 2
+; RV64ZBB-NEXT:    xori a0, a0, 1
 ; RV64ZBB-NEXT:    xori a1, a1, 1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -785,8 +785,8 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
 ; RV64ZBB-NEXT:    cpopw a1, a1
 ; RV64ZBB-NEXT:    cpopw a0, a0
 ; RV64ZBB-NEXT:    addi a0, a0, -1
-; RV64ZBB-NEXT:    seqz a0, a0
 ; RV64ZBB-NEXT:    addi a1, a1, -1
+; RV64ZBB-NEXT:    seqz a0, a0
 ; RV64ZBB-NEXT:    seqz a1, a1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -801,11 +801,11 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
 ; RV64I-NEXT:    xor a0, a0, a2
 ; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    sltu a0, a2, a0
-; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    addiw a2, a1, -1
 ; RV64I-NEXT:    xor a1, a1, a2
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    xori a1, a1, 1
 ; RV64I-NEXT:    ret
 ;
@@ -814,8 +814,8 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
 ; RV64ZBB-NEXT:    cpopw a1, a1
 ; RV64ZBB-NEXT:    cpopw a0, a0
 ; RV64ZBB-NEXT:    addi a0, a0, -1
-; RV64ZBB-NEXT:    snez a0, a0
 ; RV64ZBB-NEXT:    addi a1, a1, -1
+; RV64ZBB-NEXT:    snez a0, a0
 ; RV64ZBB-NEXT:    snez a1, a1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -828,28 +828,28 @@ declare i64 @llvm.ctpop.i64(i64)
 define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV64I-LABEL: ctpop_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -950,49 +950,49 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 1
 ; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    lui a4, 209715
+; RV64I-NEXT:    lui a5, 61681
 ; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    slli a4, a3, 32
-; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    addiw a4, a4, 819
+; RV64I-NEXT:    addiw a5, a5, -241
+; RV64I-NEXT:    slli a6, a3, 32
+; RV64I-NEXT:    add a3, a3, a6
+; RV64I-NEXT:    slli a6, a4, 32
+; RV64I-NEXT:    add a4, a4, a6
+; RV64I-NEXT:    slli a6, a5, 32
+; RV64I-NEXT:    add a5, a5, a6
+; RV64I-NEXT:    srli a6, a1, 1
 ; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    and a3, a6, a3
 ; RV64I-NEXT:    sub a0, a0, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a4, a2, 32
-; RV64I-NEXT:    add a2, a2, a4
-; RV64I-NEXT:    and a4, a0, a2
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    srli a4, a0, 4
-; RV64I-NEXT:    add a0, a0, a4
-; RV64I-NEXT:    lui a4, 61681
-; RV64I-NEXT:    addiw a4, a4, -241
-; RV64I-NEXT:    slli a5, a4, 32
-; RV64I-NEXT:    add a4, a4, a5
-; RV64I-NEXT:    and a0, a0, a4
-; RV64I-NEXT:    slli a5, a0, 8
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    slli a5, a0, 16
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    slli a5, a0, 32
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    srli a5, a1, 1
-; RV64I-NEXT:    and a3, a5, a3
 ; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a0, a4
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a3, a1, a4
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
-; RV64I-NEXT:    srli a2, a1, 4
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    and a0, a0, a4
 ; RV64I-NEXT:    and a1, a1, a4
-; RV64I-NEXT:    slli a2, a1, 8
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    slli a2, a1, 16
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    srli a3, a1, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    and a0, a0, a5
+; RV64I-NEXT:    and a1, a1, a5
+; RV64I-NEXT:    slli a2, a0, 8
+; RV64I-NEXT:    slli a3, a1, 8
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    slli a2, a0, 16
+; RV64I-NEXT:    slli a3, a1, 16
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    slli a2, a0, 32
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    srli a1, a1, 56
 ; RV64I-NEXT:    ret
 ;
@@ -1010,9 +1010,9 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    ret
 ;
@@ -1033,9 +1033,9 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    snez a1, a1
 ; RV64I-NEXT:    ret
 ;
@@ -1044,8 +1044,8 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
 ; RV64ZBB-NEXT:    cpop a1, a1
 ; RV64ZBB-NEXT:    cpop a0, a0
 ; RV64ZBB-NEXT:    sltiu a0, a0, 2
-; RV64ZBB-NEXT:    xori a0, a0, 1
 ; RV64ZBB-NEXT:    sltiu a1, a1, 2
+; RV64ZBB-NEXT:    xori a0, a0, 1
 ; RV64ZBB-NEXT:    xori a1, a1, 1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -1069,8 +1069,8 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 ; RV64ZBB-NEXT:    cpop a1, a1
 ; RV64ZBB-NEXT:    cpop a0, a0
 ; RV64ZBB-NEXT:    addi a0, a0, -1
-; RV64ZBB-NEXT:    seqz a0, a0
 ; RV64ZBB-NEXT:    addi a1, a1, -1
+; RV64ZBB-NEXT:    seqz a0, a0
 ; RV64ZBB-NEXT:    seqz a1, a1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -1084,10 +1084,10 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    xor a0, a0, a2
 ; RV64I-NEXT:    sltu a0, a2, a0
-; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    xor a1, a1, a2
 ; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    xori a1, a1, 1
 ; RV64I-NEXT:    ret
 ;
@@ -1096,8 +1096,8 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
 ; RV64ZBB-NEXT:    cpop a1, a1
 ; RV64ZBB-NEXT:    cpop a0, a0
 ; RV64ZBB-NEXT:    addi a0, a0, -1
-; RV64ZBB-NEXT:    snez a0, a0
 ; RV64ZBB-NEXT:    addi a1, a1, -1
+; RV64ZBB-NEXT:    snez a0, a0
 ; RV64ZBB-NEXT:    snez a1, a1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -1406,11 +1406,11 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srliw a3, a0, 24
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
@@ -1432,11 +1432,11 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 8
 ; RV64I-NEXT:    lui a3, 16
+; RV64I-NEXT:    srliw a4, a0, 24
 ; RV64I-NEXT:    addi a3, a3, -256
 ; RV64I-NEXT:    and a2, a2, a3
-; RV64I-NEXT:    srliw a4, a0, 24
-; RV64I-NEXT:    or a2, a2, a4
 ; RV64I-NEXT:    and a3, a0, a3
+; RV64I-NEXT:    or a2, a2, a4
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
@@ -1462,28 +1462,28 @@ define i64 @bswap_i64(i64 %a) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 40
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srli a3, a0, 56
+; RV64I-NEXT:    srli a4, a0, 24
+; RV64I-NEXT:    lui a5, 4080
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    lui a4, 4080
-; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a5, a0, 8
-; RV64I-NEXT:    srliw a5, a5, 24
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    and a4, a0, a4
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    and a4, a4, a5
+; RV64I-NEXT:    srliw a3, a3, 24
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    and a5, a0, a5
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a2, a2, 40
+; RV64I-NEXT:    or a1, a3, a1
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
index 4aa6cd42ab0994..985837d05caa20 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
@@ -275,8 +275,8 @@ define i64 @pack_i64_allWUsers(i32 signext %0, i32 signext %1, i32 signext %2) {
 ; RV64I-LABEL: pack_i64_allWUsers:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a2, a2, 32
+; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
index da477aa2043cf1..a6ef184abe5e19 100644
--- a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
@@ -25,11 +25,9 @@ define void @foo(<vscale x 8 x i8> %0) {
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 0, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v9, v10, 0
+; CHECK-NEXT:    vslideup.vi v8, v10, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.x.s s1, v9
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 0
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.x.s s2, v8
 ; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    mv a0, s1
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 74693e655bf037..163d9145bc3623 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -575,12 +575,12 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 1
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v8, v16, 0, v0.t
 ; CHECK-NEXT:    vmax.vv v8, v16, v8, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 08b310213d16e1..4ade6c09fe43de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -125,30 +125,30 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_0)
 ; CHECK-NEXT:    vle8.v v16, (a0)
-; CHECK-NEXT:    vmsltu.vx v0, v8, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v24, v16
-; CHECK-NEXT:    vsaddu.vx v16, v24, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vle8.v v17, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_2)
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vmsltu.vx v10, v16, a2
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v18, (a0)
+; CHECK-NEXT:    vmsltu.vx v0, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v16
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v17
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v17, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 2
+; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v10, 4
+; CHECK-NEXT:    vslideup.vi v0, v17, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vsext.vf8 v8, v18
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 6
@@ -163,65 +163,60 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vle8.v v17, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vmsltu.vx v10, v16, a2
+; CHECK-NEXT:    vle8.v v18, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
-; CHECK-NEXT:    vle8.v v11, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v11
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v11, v16, a2
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vle8.v v19, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
-; CHECK-NEXT:    vle8.v v12, (a0)
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vle8.v v20, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
-; CHECK-NEXT:    vle8.v v13, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v12
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v12, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v13
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v13, v16, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v10, v9, 4
+; CHECK-NEXT:    vle8.v v21, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 6
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v22, (a0)
+; CHECK-NEXT:    vmsltu.vx v0, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v16
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v17
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v17, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v18
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v18, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v19
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v19, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v20
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v20, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v21
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v21, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v22
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v22, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v12, 2
+; CHECK-NEXT:    vslideup.vi v17, v16, 2
+; CHECK-NEXT:    vslideup.vi v0, v20, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v13, 4
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
+; CHECK-NEXT:    vslideup.vi v17, v18, 4
+; CHECK-NEXT:    vslideup.vi v0, v21, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v8, 6
+; CHECK-NEXT:    vslideup.vi v17, v19, 6
+; CHECK-NEXT:    vslideup.vi v0, v22, 6
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v10, 8
+; CHECK-NEXT:    vslideup.vi v0, v17, 8
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
index abe1920e437842..9ac2775d306682 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
@@ -16,18 +16,18 @@ define void @test(ptr %addr) {
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
 ; CHECK-NEXT:    csrrs a1, vlenb, zero
-; CHECK-NEXT:    add a2, a0, a1
-; CHECK-NEXT:    vl1re64.v v8, (a2)
+; CHECK-NEXT:    vl1re64.v v8, (a0)
 ; CHECK-NEXT:    slli a2, a1, 1
-; CHECK-NEXT:    vl1re64.v v9, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vl1re64.v v10, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v9, (a0)
-; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vs1r.v v10, (a2)
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vl1re64.v v9, (a3)
+; CHECK-NEXT:    addi a3, sp, 16
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vs1r.v v8, (a0)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re64.v v10, (a0)
+; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    vs1r.v v8, (a3)
+; CHECK-NEXT:    vs1r.v v9, (a2)
+; CHECK-NEXT:    vs1r.v v10, (a1)
 ; CHECK-NEXT:    csrrs a0, vlenb, zero
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
index 97903396679158..fb25d4e15e40e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
@@ -20,8 +20,8 @@ define <vscale x 1 x double> @test(ptr %addr, i64 %vl) {
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    vl1re64.v v9, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    vs1r.v v9, (a2)
 ; CHECK-NEXT:    vl1re64.v v8, (a2)
 ; CHECK-NEXT:    vl1re64.v v9, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll
index 4cd1b045529e3f..853f937bbd2301 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll
@@ -14,8 +14,8 @@ define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @load_store_m1x5(targe
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    csrrs a1, vlenb, zero
+; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    vs1r.v v9, (a2)
 ; CHECK-NEXT:    add a3, a2, a1
@@ -57,8 +57,8 @@ define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @load_store_m2x2(targ
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    csrrs a1, vlenb, zero
+; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    vs2r.v v10, (a1)
@@ -92,8 +92,8 @@ define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @load_store_m4x2(targ
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    csrrs a1, vlenb, zero
+; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    vs4r.v v12, (a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
index 43be8feece23c1..7fe6bd24a2552d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
@@ -774,8 +774,8 @@ define void @lmul_16_align() nounwind {
 ; NOZBA-NEXT:    csrr a0, vlenb
 ; NOZBA-NEXT:    add a0, sp, a0
 ; NOZBA-NEXT:    addi a0, a0, 128
-; NOZBA-NEXT:    vs8r.v v8, (a0)
 ; NOZBA-NEXT:    csrr a1, vlenb
+; NOZBA-NEXT:    vs8r.v v8, (a0)
 ; NOZBA-NEXT:    slli a1, a1, 3
 ; NOZBA-NEXT:    add a0, a0, a1
 ; NOZBA-NEXT:    vs8r.v v8, (a0)
@@ -805,8 +805,8 @@ define void @lmul_16_align() nounwind {
 ; ZBA-NEXT:    csrr a0, vlenb
 ; ZBA-NEXT:    add a0, sp, a0
 ; ZBA-NEXT:    addi a0, a0, 128
-; ZBA-NEXT:    vs8r.v v8, (a0)
 ; ZBA-NEXT:    csrr a1, vlenb
+; ZBA-NEXT:    vs8r.v v8, (a0)
 ; ZBA-NEXT:    sh3add a0, a1, a0
 ; ZBA-NEXT:    vs8r.v v8, (a0)
 ; ZBA-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
@@ -837,8 +837,8 @@ define void @lmul_16_align() nounwind {
 ; NOMUL-NEXT:    csrr a0, vlenb
 ; NOMUL-NEXT:    add a0, sp, a0
 ; NOMUL-NEXT:    addi a0, a0, 128
-; NOMUL-NEXT:    vs8r.v v8, (a0)
 ; NOMUL-NEXT:    csrr a1, vlenb
+; NOMUL-NEXT:    vs8r.v v8, (a0)
 ; NOMUL-NEXT:    slli a1, a1, 3
 ; NOMUL-NEXT:    add a0, a0, a1
 ; NOMUL-NEXT:    vs8r.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
index 29d19ed38bbeda..1ed84316d4484c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
@@ -10,17 +10,17 @@ define <vscale x 1 x i8> @bitreverse_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsll.vi v9, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -41,17 +41,17 @@ define <vscale x 2 x i8> @bitreverse_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsll.vi v9, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -72,17 +72,17 @@ define <vscale x 4 x i8> @bitreverse_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsll.vi v9, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -103,17 +103,17 @@ define <vscale x 8 x i8> @bitreverse_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vsll.vi v9, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -134,17 +134,17 @@ define <vscale x 16 x i8> @bitreverse_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vsll.vi v10, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -165,17 +165,17 @@ define <vscale x 32 x i8> @bitreverse_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsll.vi v12, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -196,17 +196,17 @@ define <vscale x 64 x i8> @bitreverse_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vsll.vi v16, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -227,26 +227,26 @@ define <vscale x 1 x i16> @bitreverse_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -267,26 +267,26 @@ define <vscale x 2 x i16> @bitreverse_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -307,26 +307,26 @@ define <vscale x 4 x i16> @bitreverse_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -347,26 +347,26 @@ define <vscale x 8 x i16> @bitreverse_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -387,26 +387,26 @@ define <vscale x 16 x i16> @bitreverse_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -427,26 +427,26 @@ define <vscale x 32 x i16> @bitreverse_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -467,34 +467,34 @@ define <vscale x 1 x i32> @bitreverse_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -515,34 +515,34 @@ define <vscale x 2 x i32> @bitreverse_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -563,34 +563,34 @@ define <vscale x 4 x i32> @bitreverse_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsll.vi v12, v12, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vsll.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -611,34 +611,34 @@ define <vscale x 8 x i32> @bitreverse_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
-; CHECK-NEXT:    vand.vx v16, v8, a0
-; CHECK-NEXT:    vsll.vi v16, v16, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vsll.vi v16, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -659,34 +659,34 @@ define <vscale x 16 x i32> @bitreverse_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    vor.vv v16, v16, v24
-; CHECK-NEXT:    vand.vx v24, v8, a0
-; CHECK-NEXT:    vsll.vi v24, v24, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vsll.vi v24, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -707,65 +707,65 @@ define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v10, a2
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v11, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v10, v10, a3
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v12, v8, a1
+; RV32-NEXT:    vand.vx v11, v11, a0
+; RV32-NEXT:    vlse64.v v13, (a5), zero
+; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vx v11, v8, a0
+; RV32-NEXT:    vsll.vx v11, v11, a2
+; RV32-NEXT:    vor.vv v11, v12, v11
 ; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a0
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    vsll.vx v12, v12, a1
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vand.vx v9, v9, a4
+; RV32-NEXT:    vand.vv v12, v12, v13
+; RV32-NEXT:    vor.vv v9, v12, v9
 ; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    vand.vv v12, v8, v13
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v9, v9, v10
+; RV32-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v11, a2
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -774,60 +774,60 @@ define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
 ;
 ; RV64-LABEL: bitreverse_nxv1i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vx v9, v8, a0
-; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v10, v8, a1
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    li a0, 40
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v10, v10, a2
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vi v10, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV64-NEXT:    vsrl.vi v9, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v10, v10, a3
+; RV64-NEXT:    vsrl.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vx v11, v8, a0
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v11, v11, a2
+; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vsrl.vi v11, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v9, v9, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vor.vv v9, v11, v9
+; RV64-NEXT:    vand.vx v11, v8, a3
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    vor.vv v9, v9, v10
+; RV64-NEXT:    vand.vx v10, v8, a4
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    vsll.vi v11, v11, 24
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vand.vx v10, v8, a3
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    vand.vx v11, v8, a4
-; RV64-NEXT:    vsll.vi v11, v11, 8
-; RV64-NEXT:    vor.vv v10, v10, v11
-; RV64-NEXT:    vsll.vx v11, v8, a0
+; RV64-NEXT:    vsll.vx v11, v8, a1
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vsll.vx v8, v8, a1
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    slli a0, a4, 32
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    add a1, a1, a3
 ; RV64-NEXT:    vor.vv v8, v11, v8
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v9, v9, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    ret
@@ -848,65 +848,65 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a0
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    vsrl.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v14, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v12, v12, a3
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v14
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a0
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    vsll.vx v16, v16, a1
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a3
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vsrl.vx v10, v8, a1
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a0
+; RV32-NEXT:    vlse64.v v14, (a5), zero
+; RV32-NEXT:    vor.vv v12, v12, v10
+; RV32-NEXT:    vand.vx v10, v8, a0
+; RV32-NEXT:    vsll.vx v10, v10, a2
+; RV32-NEXT:    vor.vv v10, v18, v10
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v16, v16, a4
+; RV32-NEXT:    vand.vv v18, v18, v14
+; RV32-NEXT:    vor.vv v16, v18, v16
 ; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    vand.vv v14, v8, v14
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v12, v16, v12
+; RV32-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vsll.vi v14, v14, 8
+; RV32-NEXT:    vor.vv v8, v8, v14
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v14, a1
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a2
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v12, v12, v14
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v12, v10
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -915,60 +915,60 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
 ;
 ; RV64-LABEL: bitreverse_nxv2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vx v10, v8, a0
-; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v12, v8, a1
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    li a0, 40
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v12, v12, a2
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v12, v12, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a1
+; RV64-NEXT:    vsrl.vx v14, v8, a0
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v14, v14, a2
+; RV64-NEXT:    vor.vv v12, v14, v12
 ; RV64-NEXT:    vsrl.vi v14, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v14, v14, a4
+; RV64-NEXT:    vor.vv v10, v14, v10
+; RV64-NEXT:    vand.vx v14, v8, a3
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    vor.vv v10, v10, v12
+; RV64-NEXT:    vand.vx v12, v8, a4
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    vsll.vi v14, v14, 24
+; RV64-NEXT:    vsll.vi v12, v12, 8
 ; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vand.vx v12, v8, a3
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    vand.vx v14, v8, a4
-; RV64-NEXT:    vsll.vi v14, v14, 8
-; RV64-NEXT:    vor.vv v12, v12, v14
-; RV64-NEXT:    vsll.vx v14, v8, a0
+; RV64-NEXT:    vsll.vx v14, v8, a1
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vsll.vx v8, v8, a1
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    slli a0, a4, 32
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    add a1, a1, a3
 ; RV64-NEXT:    vor.vv v8, v14, v8
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    ret
@@ -989,65 +989,65 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a0
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v20, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v20
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsll.vx v16, v8, a0
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    vsll.vx v24, v24, a1
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    vand.vx v24, v8, a3
-; RV32-NEXT:    vsll.vi v24, v24, 24
-; RV32-NEXT:    vand.vv v8, v8, v20
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vsrl.vx v12, v8, a1
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a0
+; RV32-NEXT:    vlse64.v v20, (a5), zero
+; RV32-NEXT:    vor.vv v16, v16, v12
+; RV32-NEXT:    vand.vx v12, v8, a0
+; RV32-NEXT:    vsll.vx v12, v12, a2
+; RV32-NEXT:    vor.vv v12, v28, v12
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v24, v24, a4
+; RV32-NEXT:    vand.vv v28, v28, v20
+; RV32-NEXT:    vor.vv v24, v28, v24
 ; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    vand.vv v20, v8, v20
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vsetvli a3, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    vsll.vi v20, v20, 8
+; RV32-NEXT:    vor.vv v8, v8, v20
+; RV32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v20, a1
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a2
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v16, v16, v20
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v16, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1056,60 +1056,60 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
 ;
 ; RV64-LABEL: bitreverse_nxv4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vx v12, v8, a0
-; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v16, v8, a1
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    li a0, 40
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a1
+; RV64-NEXT:    vsrl.vx v20, v8, a0
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v20, v20, a2
+; RV64-NEXT:    vor.vv v12, v20, v12
 ; RV64-NEXT:    vsrl.vi v20, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vor.vv v20, v20, v16
 ; RV64-NEXT:    vand.vx v16, v8, a3
-; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    vor.vv v12, v20, v12
 ; RV64-NEXT:    vand.vx v20, v8, a4
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    vsll.vi v16, v16, 24
 ; RV64-NEXT:    vsll.vi v20, v20, 8
 ; RV64-NEXT:    vor.vv v16, v16, v20
-; RV64-NEXT:    vsll.vx v20, v8, a0
+; RV64-NEXT:    vsll.vx v20, v8, a1
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vsll.vx v8, v8, a1
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    slli a0, a4, 32
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    add a1, a1, a3
 ; RV64-NEXT:    vor.vv v8, v20, v8
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    ret
@@ -1130,80 +1130,87 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a0
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v24, a2
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v0, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a0
 ; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v0, v0, a3
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vsll.vx v16, v16, a2
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v0, (a5), zero
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a4
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a2
-; RV32-NEXT:    vsll.vx v0, v0, a1
-; RV32-NEXT:    vsll.vx v24, v8, a0
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    lui a2, 349525
 ; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1212,60 +1219,60 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
 ;
 ; RV64-LABEL: bitreverse_nxv8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v16, v8, a0
-; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v24, v8, a1
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    li a0, 40
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a1
+; RV64-NEXT:    vsrl.vx v0, v8, a0
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v0, v0, a2
+; RV64-NEXT:    vor.vv v16, v0, v16
 ; RV64-NEXT:    vsrl.vi v0, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v0, v0, v24
 ; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    vor.vv v16, v0, v16
 ; RV64-NEXT:    vand.vx v0, v8, a4
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    vsll.vi v24, v24, 24
 ; RV64-NEXT:    vsll.vi v0, v0, 8
 ; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vsll.vx v0, v8, a0
+; RV64-NEXT:    vsll.vx v0, v8, a1
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vsll.vx v8, v8, a1
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    slli a0, a4, 32
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    add a1, a1, a3
 ; RV64-NEXT:    vor.vv v8, v0, v8
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 8abe35bf1d97ec..66a1178cddb66c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -15,18 +15,18 @@ define <vscale x 1 x i8> @vp_bitreverse_nxv1i8(<vscale x 1 x i8> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -47,20 +47,20 @@ define <vscale x 1 x i8> @vp_bitreverse_nxv1i8_unmasked(<vscale x 1 x i8> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -81,18 +81,18 @@ define <vscale x 2 x i8> @vp_bitreverse_nxv2i8(<vscale x 2 x i8> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -113,20 +113,20 @@ define <vscale x 2 x i8> @vp_bitreverse_nxv2i8_unmasked(<vscale x 2 x i8> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -147,18 +147,18 @@ define <vscale x 4 x i8> @vp_bitreverse_nxv4i8(<vscale x 4 x i8> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -179,20 +179,20 @@ define <vscale x 4 x i8> @vp_bitreverse_nxv4i8_unmasked(<vscale x 4 x i8> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -213,18 +213,18 @@ define <vscale x 8 x i8> @vp_bitreverse_nxv8i8(<vscale x 8 x i8> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -245,20 +245,20 @@ define <vscale x 8 x i8> @vp_bitreverse_nxv8i8_unmasked(<vscale x 8 x i8> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -279,18 +279,18 @@ define <vscale x 16 x i8> @vp_bitreverse_nxv16i8(<vscale x 16 x i8> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v10, v10, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -311,20 +311,20 @@ define <vscale x 16 x i8> @vp_bitreverse_nxv16i8_unmasked(<vscale x 16 x i8> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v8, 15
-; CHECK-NEXT:    vsll.vi v10, v10, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v10, v10, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -345,18 +345,18 @@ define <vscale x 32 x i8> @vp_bitreverse_nxv32i8(<vscale x 32 x i8> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v12, v12, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -377,20 +377,20 @@ define <vscale x 32 x i8> @vp_bitreverse_nxv32i8_unmasked(<vscale x 32 x i8> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v8, 15
-; CHECK-NEXT:    vsll.vi v12, v12, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v12, v12, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -411,18 +411,18 @@ define <vscale x 64 x i8> @vp_bitreverse_nxv64i8(<vscale x 64 x i8> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -443,20 +443,20 @@ define <vscale x 64 x i8> @vp_bitreverse_nxv64i8_unmasked(<vscale x 64 x i8> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v8, 15
-; CHECK-NEXT:    vsll.vi v16, v16, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v16, v16, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -477,25 +477,25 @@ define <vscale x 1 x i16> @vp_bitreverse_nxv1i16(<vscale x 1 x i16> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -517,26 +517,26 @@ define <vscale x 1 x i16> @vp_bitreverse_nxv1i16_unmasked(<vscale x 1 x i16> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -557,25 +557,25 @@ define <vscale x 2 x i16> @vp_bitreverse_nxv2i16(<vscale x 2 x i16> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -597,26 +597,26 @@ define <vscale x 2 x i16> @vp_bitreverse_nxv2i16_unmasked(<vscale x 2 x i16> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -637,25 +637,25 @@ define <vscale x 4 x i16> @vp_bitreverse_nxv4i16(<vscale x 4 x i16> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -677,26 +677,26 @@ define <vscale x 4 x i16> @vp_bitreverse_nxv4i16_unmasked(<vscale x 4 x i16> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -717,25 +717,25 @@ define <vscale x 8 x i16> @vp_bitreverse_nxv8i16(<vscale x 8 x i16> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -757,26 +757,26 @@ define <vscale x 8 x i16> @vp_bitreverse_nxv8i16_unmasked(<vscale x 8 x i16> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -797,25 +797,25 @@ define <vscale x 16 x i16> @vp_bitreverse_nxv16i16(<vscale x 16 x i16> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -837,26 +837,26 @@ define <vscale x 16 x i16> @vp_bitreverse_nxv16i16_unmasked(<vscale x 16 x i16>
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -877,25 +877,25 @@ define <vscale x 32 x i16> @vp_bitreverse_nxv32i16(<vscale x 32 x i16> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 4, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -917,26 +917,26 @@ define <vscale x 32 x i16> @vp_bitreverse_nxv32i16_unmasked(<vscale x 32 x i16>
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -963,27 +963,27 @@ define <vscale x 1 x i32> @vp_bitreverse_nxv1i32(<vscale x 1 x i32> %va, <vscale
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1005,34 +1005,34 @@ define <vscale x 1 x i32> @vp_bitreverse_nxv1i32_unmasked(<vscale x 1 x i32> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -1059,27 +1059,27 @@ define <vscale x 2 x i32> @vp_bitreverse_nxv2i32(<vscale x 2 x i32> %va, <vscale
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1101,34 +1101,34 @@ define <vscale x 2 x i32> @vp_bitreverse_nxv2i32_unmasked(<vscale x 2 x i32> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -1155,27 +1155,27 @@ define <vscale x 4 x i32> @vp_bitreverse_nxv4i32(<vscale x 4 x i32> %va, <vscale
 ; CHECK-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v10, v10, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1197,34 +1197,34 @@ define <vscale x 4 x i32> @vp_bitreverse_nxv4i32_unmasked(<vscale x 4 x i32> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsll.vi v12, v12, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vsll.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -1251,27 +1251,27 @@ define <vscale x 8 x i32> @vp_bitreverse_nxv8i32(<vscale x 8 x i32> %va, <vscale
 ; CHECK-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v12, v12, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1293,34 +1293,34 @@ define <vscale x 8 x i32> @vp_bitreverse_nxv8i32_unmasked(<vscale x 8 x i32> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
-; CHECK-NEXT:    vand.vx v16, v8, a0
-; CHECK-NEXT:    vsll.vi v16, v16, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vsll.vi v16, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -1347,27 +1347,27 @@ define <vscale x 16 x i32> @vp_bitreverse_nxv16i32(<vscale x 16 x i32> %va, <vsc
 ; CHECK-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vand.vx v24, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 4, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1389,34 +1389,34 @@ define <vscale x 16 x i32> @vp_bitreverse_nxv16i32_unmasked(<vscale x 16 x i32>
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    vor.vv v16, v16, v24
-; CHECK-NEXT:    vand.vx v24, v8, a0
-; CHECK-NEXT:    vsll.vi v24, v24, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vsll.vi v24, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -1437,68 +1437,67 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64(<vscale x 1 x i64> %va, <vscale
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v11, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v11, v11, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v11, v0.t
+; RV32-NEXT:    vand.vx v11, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v9, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v11, v0.t
+; RV32-NEXT:    vsrl.vx v11, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a4
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
-; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v11, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v9, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v11, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1508,59 +1507,59 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64(<vscale x 1 x i64> %va, <vscale
 ; RV64-LABEL: vp_bitreverse_nxv1i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a7, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
 ; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    ret
@@ -1580,67 +1579,67 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64_unmasked(<vscale x 1 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4
-; RV32-NEXT:    vsll.vi v11, v11, 24
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v9, v11
-; RV32-NEXT:    vsrl.vx v11, v8, a1
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a2
+; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vand.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a1
 ; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a6), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v10, v10, v13
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vv v13, v13, v12
+; RV32-NEXT:    vor.vv v9, v13, v9
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v11, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1650,59 +1649,59 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64_unmasked(<vscale x 1 x i64> %va
 ; RV64-LABEL: vp_bitreverse_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsll.vi v9, v9, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0
-; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4
-; RV64-NEXT:    vor.vv v10, v10, v11
+; RV64-NEXT:    vsrl.vi v9, v8, 24
+; RV64-NEXT:    vsrl.vi v10, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v11, v8, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vx v11, v8, a4
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsrl.vi v11, v8, 24
-; RV64-NEXT:    vand.vx v11, v11, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v11
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vor.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v9, v9, v11
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vor.vv v8, v8, v9
+; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v9, v9, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    ret
@@ -1723,68 +1722,67 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64(<vscale x 2 x i64> %va, <vscale
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v12, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v14, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v14, v14, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v14, v0.t
+; RV32-NEXT:    vand.vx v14, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v14, v14, 24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v14, v0.t
-; RV32-NEXT:    vsrl.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v14, v0.t
+; RV32-NEXT:    vsrl.vx v14, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a4
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v14, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v14, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v14, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1794,59 +1792,59 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64(<vscale x 2 x i64> %va, <vscale
 ; RV64-LABEL: vp_bitreverse_nxv2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v12, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4, v0.t
+; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v14, v14, a7, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
 ; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV64-NEXT:    vsrl.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v14, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v14, v14, a3, v0.t
+; RV64-NEXT:    vsrl.vx v14, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
 ; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    ret
@@ -1866,67 +1864,67 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64_unmasked(<vscale x 2 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v14, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v10, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vand.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v10, v16, v10
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4
-; RV32-NEXT:    vsll.vi v14, v14, 24
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v14, v14, v16
-; RV32-NEXT:    vor.vv v10, v10, v14
-; RV32-NEXT:    vsrl.vx v14, v8, a1
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vsll.vx v18, v18, a4
+; RV32-NEXT:    vor.vv v12, v12, v18
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v14, v14, a5
+; RV32-NEXT:    vand.vv v18, v18, v16
+; RV32-NEXT:    vor.vv v14, v18, v14
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a4, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v10, v14, v10
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v14, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v10, v10, v16
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v10, v10, v14
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1936,59 +1934,59 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64_unmasked(<vscale x 2 x i64> %va
 ; RV64-LABEL: vp_bitreverse_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4
-; RV64-NEXT:    vor.vv v12, v12, v14
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vx v14, v8, a4
-; RV64-NEXT:    vand.vx v14, v14, a3
+; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsrl.vi v14, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v10, v8, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a5
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vor.vv v10, v16, v10
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v14, v14, a2
 ; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsrl.vi v14, v8, 24
-; RV64-NEXT:    vand.vx v14, v14, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v14, v8, a2
+; RV64-NEXT:    vsll.vi v14, v14, 8
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v14
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vor.vv v8, v8, v14
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    ret
@@ -2009,70 +2007,69 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64(<vscale x 4 x i64> %va, <vscale
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v20, v20, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v24, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v28, a4
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vor.vv v20, v8, v20, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vsrl.vi v20, v16, 4, v0.t
+; RV32-NEXT:    vand.vv v20, v20, v28, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v28, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 4, v0.t
+; RV32-NEXT:    vor.vv v16, v20, v16, v0.t
+; RV32-NEXT:    vsrl.vi v20, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v20, v20, v12, v0.t
+; RV32-NEXT:    vand.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vor.vv v12, v20, v12, v0.t
+; RV32-NEXT:    vsrl.vi v16, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -2080,59 +2077,59 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64(<vscale x 4 x i64> %va, <vscale
 ; RV64-LABEL: vp_bitreverse_nxv4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v20, v20, a7, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v20, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v20, v20, a3, v0.t
+; RV64-NEXT:    vsrl.vx v20, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
 ; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    ret
@@ -2152,67 +2149,67 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64_unmasked(<vscale x 4 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v20, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vand.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v12, v24, v12
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4
-; RV32-NEXT:    vsll.vi v20, v20, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vor.vv v12, v12, v20
-; RV32-NEXT:    vsrl.vx v20, v8, a1
-; RV32-NEXT:    vsrl.vx v24, v8, a3
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v24, v24, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vsll.vx v28, v28, a4
+; RV32-NEXT:    vor.vv v16, v16, v28
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v20, v20, a5
+; RV32-NEXT:    vand.vv v28, v28, v24
+; RV32-NEXT:    vor.vv v20, v28, v20
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsetvli a4, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v12, v20, v12
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v20, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v12, v12, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v12, v12, v20
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2222,59 +2219,59 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64_unmasked(<vscale x 4 x i64> %va
 ; RV64-LABEL: vp_bitreverse_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v16, v20
-; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vx v20, v8, a4
-; RV64-NEXT:    vand.vx v20, v20, a3
+; RV64-NEXT:    vsrl.vi v16, v8, 24
+; RV64-NEXT:    vsrl.vi v20, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v12, v8, a3
+; RV64-NEXT:    vsrl.vx v24, v8, a5
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vor.vv v12, v24, v12
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v20, v20, a2
 ; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vsrl.vi v20, v8, 24
-; RV64-NEXT:    vand.vx v20, v20, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v20, v8, a2
+; RV64-NEXT:    vsll.vi v20, v20, 8
+; RV64-NEXT:    vor.vv v20, v24, v20
+; RV64-NEXT:    vsll.vx v24, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v20
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vor.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v24, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vor.vv v8, v8, v20
+; RV64-NEXT:    vor.vv v8, v8, v12
+; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    ret
@@ -2301,33 +2298,33 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
@@ -2346,14 +2343,14 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
@@ -2365,42 +2362,42 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vsll.vi v24, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -2419,66 +2416,65 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2504,82 +2500,89 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v24, v8, a1
+; RV32-NEXT:    vsll.vx v24, v24, a4
 ; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v24, v8, v24
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2588,62 +2591,78 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
 ;
 ; RV64-LABEL: vp_bitreverse_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v0, v0, v16
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsll.vx v8, v8, a5
 ; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v0
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_bitreverse_nxv7i64_unmasked:
@@ -2668,33 +2687,33 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
@@ -2713,14 +2732,14 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
@@ -2732,42 +2751,42 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vsll.vi v24, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -2786,66 +2805,65 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2871,82 +2889,89 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v24, v8, a1
+; RV32-NEXT:    vsll.vx v24, v24, a4
 ; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v24, v8, v24
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2955,62 +2980,78 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va
 ;
 ; RV64-LABEL: vp_bitreverse_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v0, v0, v16
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsll.vx v8, v8, a5
 ; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v0
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_bitreverse_nxv8i64_unmasked:
@@ -3040,69 +3081,69 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    srli a1, a2, 1
-; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    sub a1, a0, a2
-; CHECK-NEXT:    sltu a3, a0, a1
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a1, a3, a1
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    lui a2, 3
+; CHECK-NEXT:    srli a4, a3, 1
+; CHECK-NEXT:    slli a3, a3, 2
+; CHECK-NEXT:    vsetvli a5, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a4
+; CHECK-NEXT:    sub a4, a0, a3
+; CHECK-NEXT:    sltu a5, a0, a4
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a5, a5, a4
+; CHECK-NEXT:    lui a6, 5
+; CHECK-NEXT:    addi a4, a1, -241
+; CHECK-NEXT:    addi a2, a2, 819
+; CHECK-NEXT:    addi a1, a6, 1365
+; CHECK-NEXT:    vsetvli zero, a5, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
-; CHECK-NEXT:    vor.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
-; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; CHECK-NEXT:    lui a3, 3
-; CHECK-NEXT:    addi a3, a3, 819
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; CHECK-NEXT:    lui a4, 5
-; CHECK-NEXT:    addi a4, a4, 1365
-; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
-; CHECK-NEXT:    vsll.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    addi a5, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    bltu a0, a2, .LBB46_2
+; CHECK-NEXT:    bltu a0, a3, .LBB46_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:  .LBB46_2:
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -3120,9 +3161,9 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 1
+; CHECK-ZVBB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
 ; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
@@ -3144,58 +3185,58 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bitreverse_nxv64i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    lui a2, 3
+; CHECK-NEXT:    slli a3, a3, 2
+; CHECK-NEXT:    sub a4, a0, a3
+; CHECK-NEXT:    sltu a5, a0, a4
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a5, a5, a4
+; CHECK-NEXT:    lui a6, 5
+; CHECK-NEXT:    addi a4, a1, -241
+; CHECK-NEXT:    addi a2, a2, 819
+; CHECK-NEXT:    addi a1, a6, 1365
+; CHECK-NEXT:    vsetvli zero, a5, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 8
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
 ; CHECK-NEXT:    vor.vv v16, v16, v24
 ; CHECK-NEXT:    vsrl.vi v24, v16, 4
-; CHECK-NEXT:    lui a2, 1
-; CHECK-NEXT:    addi a2, a2, -241
-; CHECK-NEXT:    vand.vx v24, v24, a2
-; CHECK-NEXT:    vand.vx v16, v16, a2
+; CHECK-NEXT:    vand.vx v16, v16, a4
+; CHECK-NEXT:    vand.vx v24, v24, a4
 ; CHECK-NEXT:    vsll.vi v16, v16, 4
 ; CHECK-NEXT:    vor.vv v16, v24, v16
 ; CHECK-NEXT:    vsrl.vi v24, v16, 2
-; CHECK-NEXT:    lui a3, 3
-; CHECK-NEXT:    addi a3, a3, 819
-; CHECK-NEXT:    vand.vx v24, v24, a3
-; CHECK-NEXT:    vand.vx v16, v16, a3
+; CHECK-NEXT:    vand.vx v16, v16, a2
+; CHECK-NEXT:    vand.vx v24, v24, a2
 ; CHECK-NEXT:    vsll.vi v16, v16, 2
 ; CHECK-NEXT:    vor.vv v16, v24, v16
 ; CHECK-NEXT:    vsrl.vi v24, v16, 1
-; CHECK-NEXT:    lui a4, 5
-; CHECK-NEXT:    addi a4, a4, 1365
-; CHECK-NEXT:    vand.vx v24, v24, a4
-; CHECK-NEXT:    vand.vx v16, v16, a4
+; CHECK-NEXT:    vand.vx v16, v16, a1
+; CHECK-NEXT:    vand.vx v24, v24, a1
 ; CHECK-NEXT:    vadd.vv v16, v16, v16
 ; CHECK-NEXT:    vor.vv v16, v24, v16
-; CHECK-NEXT:    bltu a0, a1, .LBB47_2
+; CHECK-NEXT:    bltu a0, a3, .LBB47_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:  .LBB47_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v24
 ; CHECK-NEXT:    vsrl.vi v24, v8, 4
-; CHECK-NEXT:    vand.vx v24, v24, a2
-; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vand.vx v8, v8, a4
+; CHECK-NEXT:    vand.vx v24, v24, a4
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    vsrl.vi v24, v8, 2
-; CHECK-NEXT:    vand.vx v24, v24, a3
-; CHECK-NEXT:    vand.vx v8, v8, a3
+; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vand.vx v24, v24, a2
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    vsrl.vi v24, v8, 1
-; CHECK-NEXT:    vand.vx v24, v24, a4
-; CHECK-NEXT:    vand.vx v8, v8, a4
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v24, v24, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
@@ -3228,25 +3269,25 @@ define <vscale x 1 x i9> @vp_bitreverse_nxv1i9(<vscale x 1 x i9> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
index 24c6b32cbfa963..2cd763afa36b73 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
@@ -124,9 +124,9 @@ define <vscale x 1 x i32> @bswap_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -151,9 +151,9 @@ define <vscale x 2 x i32> @bswap_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -178,9 +178,9 @@ define <vscale x 4 x i32> @bswap_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsll.vi v12, v12, 8
@@ -205,9 +205,9 @@ define <vscale x 8 x i32> @bswap_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
@@ -232,9 +232,9 @@ define <vscale x 16 x i32> @bswap_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    vor.vv v16, v16, v24
 ; CHECK-NEXT:    vand.vx v24, v8, a0
 ; CHECK-NEXT:    vsll.vi v24, v24, 8
@@ -259,36 +259,36 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v10, a2
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v11, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v10, v10, a3
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v12, v8, a1
+; RV32-NEXT:    vand.vx v11, v11, a0
+; RV32-NEXT:    vlse64.v v13, (a5), zero
+; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vx v11, v8, a0
+; RV32-NEXT:    vsll.vx v11, v11, a2
+; RV32-NEXT:    vor.vv v11, v12, v11
 ; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a0
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    vsll.vx v12, v12, a1
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vand.vx v9, v9, a4
+; RV32-NEXT:    vand.vv v12, v12, v13
+; RV32-NEXT:    vor.vv v9, v12, v9
+; RV32-NEXT:    vand.vv v12, v8, v13
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v9, v9, v10
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -297,28 +297,28 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64-LABEL: bswap_nxv1i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vx v9, v8, a0
 ; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v10, v8, a1
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v10, v10, a2
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vi v10, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV64-NEXT:    vsrl.vi v9, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v10, v10, a3
+; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    vsrl.vx v11, v8, a1
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v11, v11, a2
+; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vsrl.vi v11, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v9, v9, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vor.vv v9, v11, v9
+; RV64-NEXT:    vand.vx v11, v8, a3
+; RV64-NEXT:    vsll.vi v11, v11, 24
+; RV64-NEXT:    vor.vv v9, v9, v10
+; RV64-NEXT:    vand.vx v10, v8, a4
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vand.vx v10, v8, a3
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    vand.vx v11, v8, a4
-; RV64-NEXT:    vsll.vi v11, v11, 8
-; RV64-NEXT:    vor.vv v10, v10, v11
 ; RV64-NEXT:    vsll.vx v11, v8, a0
 ; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
@@ -343,36 +343,36 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a0
-; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    vsrl.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v14, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v12, v12, a3
+; RV32-NEXT:    vsrl.vx v14, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v16, v8, a1
+; RV32-NEXT:    vand.vx v14, v14, a0
+; RV32-NEXT:    vlse64.v v18, (a5), zero
+; RV32-NEXT:    vor.vv v12, v14, v12
+; RV32-NEXT:    vand.vx v14, v8, a0
+; RV32-NEXT:    vsll.vx v14, v14, a2
+; RV32-NEXT:    vor.vv v14, v16, v14
 ; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v14
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a0
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    vsll.vx v16, v16, a1
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a3
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vand.vx v10, v10, a4
+; RV32-NEXT:    vand.vv v16, v16, v18
+; RV32-NEXT:    vor.vv v10, v16, v10
+; RV32-NEXT:    vand.vv v16, v8, v18
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v10, v10, v12
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v14, v8
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -381,28 +381,28 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64-LABEL: bswap_nxv2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vx v10, v8, a0
 ; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v12, v8, a1
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v12, v12, a2
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v12, v12, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a0
+; RV64-NEXT:    vsrl.vx v14, v8, a1
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v14, v14, a2
+; RV64-NEXT:    vor.vv v12, v14, v12
 ; RV64-NEXT:    vsrl.vi v14, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v14, v14, a4
+; RV64-NEXT:    vor.vv v10, v14, v10
+; RV64-NEXT:    vand.vx v14, v8, a3
+; RV64-NEXT:    vsll.vi v14, v14, 24
+; RV64-NEXT:    vor.vv v10, v10, v12
+; RV64-NEXT:    vand.vx v12, v8, a4
+; RV64-NEXT:    vsll.vi v12, v12, 8
 ; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vand.vx v12, v8, a3
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    vand.vx v14, v8, a4
-; RV64-NEXT:    vsll.vi v14, v14, 8
-; RV64-NEXT:    vor.vv v12, v12, v14
 ; RV64-NEXT:    vsll.vx v14, v8, a0
 ; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
@@ -427,36 +427,36 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a0
-; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v20, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v16, v16, a3
+; RV32-NEXT:    vsrl.vx v20, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v24, v8, a1
+; RV32-NEXT:    vand.vx v20, v20, a0
+; RV32-NEXT:    vlse64.v v28, (a5), zero
+; RV32-NEXT:    vor.vv v16, v20, v16
+; RV32-NEXT:    vand.vx v20, v8, a0
+; RV32-NEXT:    vsll.vx v20, v20, a2
+; RV32-NEXT:    vor.vv v20, v24, v20
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v20
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsll.vx v16, v8, a0
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    vsll.vx v24, v24, a1
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    vand.vx v24, v8, a3
-; RV32-NEXT:    vsll.vi v24, v24, 24
-; RV32-NEXT:    vand.vv v8, v8, v20
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vand.vx v12, v12, a4
+; RV32-NEXT:    vand.vv v24, v24, v28
+; RV32-NEXT:    vor.vv v12, v24, v12
+; RV32-NEXT:    vand.vv v24, v8, v28
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v12, v12, v16
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vor.vv v8, v20, v8
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -465,28 +465,28 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64-LABEL: bswap_nxv4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vx v12, v8, a0
 ; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v16, v8, a1
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vi v16, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
+; RV64-NEXT:    vsrl.vi v12, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v20, v8, a1
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v20, v20, a2
+; RV64-NEXT:    vor.vv v16, v20, v16
 ; RV64-NEXT:    vsrl.vi v20, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v12, v12, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v20, v20, a4
+; RV64-NEXT:    vor.vv v12, v20, v12
+; RV64-NEXT:    vand.vx v20, v8, a3
+; RV64-NEXT:    vsll.vi v20, v20, 24
+; RV64-NEXT:    vor.vv v12, v12, v16
+; RV64-NEXT:    vand.vx v16, v8, a4
+; RV64-NEXT:    vsll.vi v16, v16, 8
 ; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vand.vx v16, v8, a3
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    vand.vx v20, v8, a4
-; RV64-NEXT:    vsll.vi v20, v20, 8
-; RV64-NEXT:    vor.vv v16, v16, v20
 ; RV64-NEXT:    vsll.vx v20, v8, a0
 ; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
@@ -511,50 +511,57 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a0
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v24, a2
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v0, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a0
 ; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v0, v0, a3
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vsll.vx v16, v16, a2
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v0, (a5), zero
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a4
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a2
-; RV32-NEXT:    vsll.vx v0, v0, a1
-; RV32-NEXT:    vsll.vx v24, v8, a0
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -564,28 +571,28 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64-LABEL: bswap_nxv8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v16, v8, a0
 ; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v24, v8, a1
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v0, v8, a1
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v0, v0, a2
+; RV64-NEXT:    vor.vv v16, v0, v16
 ; RV64-NEXT:    vsrl.vi v0, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v0, v0, a4
 ; RV64-NEXT:    vor.vv v24, v0, v24
+; RV64-NEXT:    vand.vx v0, v8, a3
+; RV64-NEXT:    vsll.vi v0, v0, 24
 ; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsll.vi v24, v24, 24
-; RV64-NEXT:    vand.vx v0, v8, a4
-; RV64-NEXT:    vsll.vi v0, v0, 8
-; RV64-NEXT:    vor.vv v24, v24, v0
+; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vsll.vi v24, v24, 8
+; RV64-NEXT:    vor.vv v24, v0, v24
 ; RV64-NEXT:    vsll.vx v0, v8, a0
 ; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 2c5b7f160d1924..1c95ec8fafd4f1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -270,9 +270,9 @@ define <vscale x 1 x i32> @vp_bswap_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -324,9 +324,9 @@ define <vscale x 2 x i32> @vp_bswap_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -378,9 +378,9 @@ define <vscale x 4 x i32> @vp_bswap_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsll.vi v12, v12, 8
@@ -432,9 +432,9 @@ define <vscale x 8 x i32> @vp_bswap_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
@@ -486,9 +486,9 @@ define <vscale x 16 x i32> @vp_bswap_nxv16i32_unmasked(<vscale x 16 x i32> %va,
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    vor.vv v16, v16, v24
 ; CHECK-NEXT:    vand.vx v24, v8, a0
 ; CHECK-NEXT:    vsll.vi v24, v24, 8
@@ -514,38 +514,38 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v11, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -554,31 +554,31 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV64-LABEL: vp_bswap_nxv1i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a5, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
-; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
@@ -599,39 +599,39 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vand.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vor.vv v11, v12, v11
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4
-; RV32-NEXT:    vsll.vi v11, v11, 24
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v10, v10, v13
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vv v13, v13, v12
+; RV32-NEXT:    vor.vv v9, v13, v9
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v9, v11
-; RV32-NEXT:    vsrl.vx v11, v8, a1
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v9, v8
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -639,34 +639,34 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV64-LABEL: vp_bswap_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsll.vi v9, v9, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0
-; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4
-; RV64-NEXT:    vor.vv v10, v10, v11
+; RV64-NEXT:    vsrl.vi v9, v8, 24
+; RV64-NEXT:    vsrl.vi v10, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v11, v8, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vx v11, v8, a4
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsrl.vi v11, v8, 24
-; RV64-NEXT:    vand.vx v11, v11, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v11
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vor.vv v8, v9, v8
+; RV64-NEXT:    vor.vv v9, v9, v11
+; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv1i64_unmasked:
@@ -686,38 +686,38 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v10, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v14, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v14, v14, 24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
+; RV32-NEXT:    vsll.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vand.vx v12, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v14, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v14, v0.t
-; RV32-NEXT:    vsrl.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -726,31 +726,31 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV64-LABEL: vp_bswap_nxv2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4, v0.t
+; RV64-NEXT:    vsll.vx v12, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v14, v14, a5, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
 ; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
-; RV64-NEXT:    vsrl.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v14, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v14, v14, a3, v0.t
+; RV64-NEXT:    vsrl.vx v12, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v14, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
 ; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
@@ -771,39 +771,39 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v14, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vand.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v14, v16, v14
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4
-; RV32-NEXT:    vsll.vi v14, v14, 24
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vsll.vx v18, v18, a4
+; RV32-NEXT:    vor.vv v12, v12, v18
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v10, v10, a5
+; RV32-NEXT:    vand.vv v18, v18, v16
+; RV32-NEXT:    vor.vv v10, v18, v10
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v14, v14, v16
-; RV32-NEXT:    vor.vv v10, v10, v14
-; RV32-NEXT:    vsrl.vx v14, v8, a1
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v10, v10, v14
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -811,34 +811,34 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV64-LABEL: vp_bswap_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4
-; RV64-NEXT:    vor.vv v12, v12, v14
+; RV64-NEXT:    vsrl.vi v10, v8, 24
+; RV64-NEXT:    vsrl.vi v12, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v14, v8, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a5
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vx v14, v8, a4
-; RV64-NEXT:    vand.vx v14, v14, a3
-; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsrl.vi v14, v8, 24
-; RV64-NEXT:    vand.vx v14, v14, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v12, v8, a2
+; RV64-NEXT:    vsll.vi v12, v12, 8
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v14
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vor.vv v10, v10, v14
+; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv2i64_unmasked:
@@ -858,34 +858,34 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
 ; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
@@ -898,31 +898,31 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV64-LABEL: vp_bswap_nxv4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v20, v20, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v20, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v20, v20, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v20, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
 ; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
@@ -943,39 +943,39 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v20, v8, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vand.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v20, v24, v20
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4
-; RV32-NEXT:    vsll.vi v20, v20, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsll.vx v28, v28, a4
+; RV32-NEXT:    vor.vv v16, v16, v28
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v12, v12, a5
+; RV32-NEXT:    vand.vv v28, v28, v24
+; RV32-NEXT:    vor.vv v12, v28, v12
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vor.vv v12, v12, v20
-; RV32-NEXT:    vsrl.vx v20, v8, a1
-; RV32-NEXT:    vsrl.vx v24, v8, a3
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v24, v24, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v12, v12, v20
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -983,34 +983,34 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV64-LABEL: vp_bswap_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v16, v20
+; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsrl.vi v16, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v20, v8, a3
+; RV64-NEXT:    vsrl.vx v24, v8, a5
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vor.vv v20, v24, v20
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vx v20, v8, a4
-; RV64-NEXT:    vand.vx v20, v20, a3
-; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vsrl.vi v20, v8, 24
-; RV64-NEXT:    vand.vx v20, v20, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsll.vx v24, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v20
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v24, v8
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vor.vv v8, v12, v8
+; RV64-NEXT:    vor.vv v12, v12, v20
+; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv4i64_unmasked:
@@ -1035,33 +1035,33 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
@@ -1080,14 +1080,14 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1123,36 +1123,35 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
@@ -1181,51 +1180,59 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v24, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    vsll.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v16, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1234,35 +1241,51 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_bswap_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    vsll.vx v0, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v0, v8
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv7i64_unmasked:
@@ -1287,33 +1310,33 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
@@ -1332,14 +1355,14 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1375,36 +1398,35 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
@@ -1433,51 +1455,59 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v24, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    vsll.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v16, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1486,35 +1516,51 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_bswap_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    vsll.vx v0, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v0, v8
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv8i64_unmasked:
@@ -1546,9 +1592,9 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -1588,9 +1634,9 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-ZVKB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVKB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVKB-NEXT:    srli a2, a1, 1
+; CHECK-ZVKB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVKB-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-ZVKB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVKB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVKB-NEXT:    sub a2, a0, a1
 ; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
 ; CHECK-ZVKB-NEXT:    addi a3, a3, -1
@@ -1661,38 +1707,38 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v11, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1702,31 +1748,31 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV64-LABEL: vp_bswap_nxv1i48:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a5, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
-; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
index c22d01987b7bd8..a4e5ab661c5285 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
@@ -70,24 +70,24 @@ define fastcc <vscale x 64 x i32> @ret_split_nxv64i32(ptr %x) {
 ; CHECK-LABEL: ret_split_nxv64i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vl8re32.v v8, (a1)
 ; CHECK-NEXT:    slli a3, a2, 3
 ; CHECK-NEXT:    slli a4, a2, 5
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    sub a4, a4, a3
-; CHECK-NEXT:    add a5, a1, a4
-; CHECK-NEXT:    vl8re32.v v8, (a5)
+; CHECK-NEXT:    add a5, a1, a2
+; CHECK-NEXT:    vl8re32.v v16, (a5)
 ; CHECK-NEXT:    add a5, a1, a3
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    vl8re32.v v16, (a1)
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    vl8re32.v v24, (a1)
-; CHECK-NEXT:    vl8re32.v v0, (a5)
-; CHECK-NEXT:    vs8r.v v16, (a0)
 ; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vs8r.v v24, (a2)
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    vs8r.v v0, (a3)
-; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    add a1, a1, a4
+; CHECK-NEXT:    vl8re32.v v24, (a5)
+; CHECK-NEXT:    vl8re32.v v0, (a1)
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a2)
+; CHECK-NEXT:    vs8r.v v24, (a3)
+; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    vs8r.v v0, (a0)
 ; CHECK-NEXT:    ret
   %v = load <vscale x 64 x i32>, ptr %x
   ret <vscale x 64 x i32> %v
@@ -100,87 +100,99 @@ define fastcc <vscale x 128 x i32> @ret_split_nxv128i32(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
+; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a3, a2, 3
-; CHECK-NEXT:    slli a4, a2, 5
-; CHECK-NEXT:    sub a5, a4, a3
-; CHECK-NEXT:    add a6, a1, a5
-; CHECK-NEXT:    vl8re32.v v8, (a6)
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    li a7, 24
-; CHECK-NEXT:    mul a6, a6, a7
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    vl8re32.v v8, (a1)
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    slli a4, a4, 5
+; CHECK-NEXT:    add a4, sp, a4
+; CHECK-NEXT:    addi a4, a4, 16
+; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    slli a4, a2, 3
+; CHECK-NEXT:    slli a5, a2, 5
 ; CHECK-NEXT:    slli a6, a2, 4
 ; CHECK-NEXT:    slli a7, a2, 6
+; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    sub a3, a5, a4
 ; CHECK-NEXT:    sub t0, a7, a6
-; CHECK-NEXT:    add t1, a1, t0
+; CHECK-NEXT:    sub a7, a7, a4
+; CHECK-NEXT:    add t1, a1, a4
+; CHECK-NEXT:    add t2, a1, a6
+; CHECK-NEXT:    add t3, a1, a5
 ; CHECK-NEXT:    vl8re32.v v8, (t1)
 ; CHECK-NEXT:    csrr t1, vlenb
-; CHECK-NEXT:    slli t1, t1, 4
+; CHECK-NEXT:    li t4, 24
+; CHECK-NEXT:    mul t1, t1, t4
 ; CHECK-NEXT:    add t1, sp, t1
 ; CHECK-NEXT:    addi t1, t1, 16
 ; CHECK-NEXT:    vs8r.v v8, (t1) # Unknown-size Folded Spill
-; CHECK-NEXT:    sub a7, a7, a3
-; CHECK-NEXT:    add t1, a1, a7
+; CHECK-NEXT:    add t1, a1, a2
+; CHECK-NEXT:    vl8re32.v v8, (t2)
+; CHECK-NEXT:    csrr t2, vlenb
+; CHECK-NEXT:    slli t2, t2, 3
+; CHECK-NEXT:    add t2, sp, t2
+; CHECK-NEXT:    addi t2, t2, 16
+; CHECK-NEXT:    vs8r.v v8, (t2) # Unknown-size Folded Spill
+; CHECK-NEXT:    add t2, a1, a3
+; CHECK-NEXT:    vl8re32.v v16, (t3)
+; CHECK-NEXT:    add t3, a1, t0
+; CHECK-NEXT:    add a1, a1, a7
 ; CHECK-NEXT:    vl8re32.v v8, (t1)
+; CHECK-NEXT:    vl8re32.v v24, (t2)
 ; CHECK-NEXT:    csrr t1, vlenb
-; CHECK-NEXT:    slli t1, t1, 3
+; CHECK-NEXT:    slli t1, t1, 4
 ; CHECK-NEXT:    add t1, sp, t1
 ; CHECK-NEXT:    addi t1, t1, 16
-; CHECK-NEXT:    vs8r.v v8, (t1) # Unknown-size Folded Spill
-; CHECK-NEXT:    add t1, a1, a3
-; CHECK-NEXT:    vl8re32.v v8, (t1)
+; CHECK-NEXT:    vs8r.v v24, (t1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v24, (t3)
 ; CHECK-NEXT:    addi t1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (t1) # Unknown-size Folded Spill
-; CHECK-NEXT:    add t1, a1, a6
-; CHECK-NEXT:    add t2, a1, a4
-; CHECK-NEXT:    li t3, 40
-; CHECK-NEXT:    mul a2, a2, t3
-; CHECK-NEXT:    add t3, a1, a2
-; CHECK-NEXT:    vl8re32.v v8, (a1)
-; CHECK-NEXT:    vl8re32.v v0, (t1)
-; CHECK-NEXT:    vl8re32.v v16, (t3)
-; CHECK-NEXT:    vl8re32.v v24, (t2)
-; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v24, (t1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v24, (a1)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v0, (a0)
 ; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vs8r.v v16, (a2)
-; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    vs8r.v v24, (a4)
+; CHECK-NEXT:    vs8r.v v8, (a2)
+; CHECK-NEXT:    add a5, a0, a5
+; CHECK-NEXT:    vs8r.v v16, (a5)
 ; CHECK-NEXT:    add a6, a0, a6
-; CHECK-NEXT:    vs8r.v v0, (a6)
-; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vs8r.v v8, (a3)
-; CHECK-NEXT:    add a7, a0, a7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vs8r.v v8, (a7)
-; CHECK-NEXT:    add t0, a0, t0
+; CHECK-NEXT:    vs8r.v v8, (a6)
+; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v8, (a4)
+; CHECK-NEXT:    add a7, a0, a7
+; CHECK-NEXT:    vs8r.v v24, (a7)
+; CHECK-NEXT:    add t0, a0, t0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vs8r.v v8, (t0)
-; CHECK-NEXT:    add a0, a0, a5
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    li a1, 40
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -236,40 +248,52 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vl8re32.v v0, (a1)
-; CHECK-NEXT:    vl8re32.v v16, (a3)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vl8re32.v v8, (a2)
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v0, (a0)
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a2, a2, a1
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl8re32.v v8, (a0)
 ; CHECK-NEXT:    vl8re32.v v16, (a2)
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vadd.vv v24, v8, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vadd.vv v0, v8, v0
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vadd.vv v0, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vadd.vv v24, v0, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vadd.vv v8, v0, v8
-; CHECK-NEXT:    vadd.vv v24, v24, v16
+; CHECK-NEXT:    vadd.vv v8, v8, v16
 ; CHECK-NEXT:    vadd.vx v16, v8, a4
 ; CHECK-NEXT:    vadd.vx v8, v24, a4
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -306,10 +330,10 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vsca
 ; RV32-NEXT:    add a3, a0, a1
 ; RV32-NEXT:    vl8re32.v v24, (a3)
 ; RV32-NEXT:    vl8re32.v v0, (a0)
+; RV32-NEXT:    addi a3, sp, 128
 ; RV32-NEXT:    addi a0, sp, 128
-; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    vs8r.v v8, (a3)
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    li a3, 2
 ; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    vmv8r.v v8, v0
@@ -344,10 +368,10 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vsca
 ; RV64-NEXT:    add a3, a0, a1
 ; RV64-NEXT:    vl8re32.v v24, (a3)
 ; RV64-NEXT:    vl8re32.v v0, (a0)
+; RV64-NEXT:    addi a3, sp, 128
 ; RV64-NEXT:    addi a0, sp, 128
-; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    addi a0, sp, 128
+; RV64-NEXT:    vs8r.v v8, (a3)
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    li a3, 2
 ; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    vmv8r.v v8, v0
@@ -382,35 +406,35 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    andi sp, sp, -128
-; RV32-NEXT:    vmv8r.v v0, v16
+; RV32-NEXT:    addi a1, sp, 128
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a3, a2, a1
-; RV32-NEXT:    vl8re32.v v16, (a3)
+; RV32-NEXT:    vl8re32.v v16, (a2)
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 128
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a2, a2, a1
 ; RV32-NEXT:    add a3, a0, a1
-; RV32-NEXT:    vl8re32.v v16, (a3)
-; RV32-NEXT:    addi a3, sp, 128
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vl8re32.v v24, (a2)
+; RV32-NEXT:    vl8re32.v v0, (a2)
+; RV32-NEXT:    vl8re32.v v24, (a3)
 ; RV32-NEXT:    vl8re32.v v16, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 128
-; RV32-NEXT:    vs8r.v v16, (a2)
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 128
+; RV32-NEXT:    vs8r.v v16, (a3)
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vs8r.v v0, (a0)
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    addi a2, sp, 128
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
@@ -419,16 +443,15 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 128
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    li a5, 42
-; RV32-NEXT:    addi a3, sp, 128
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vs8r.v v8, (a1)
-; RV32-NEXT:    vmv8r.v v8, v24
+; RV32-NEXT:    vs8r.v v24, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 128
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv8r.v v16, v0
 ; RV32-NEXT:    call ext3
 ; RV32-NEXT:    addi sp, s0, -144
 ; RV32-NEXT:    .cfi_def_cfa sp, 144
@@ -455,35 +478,35 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    vmv8r.v v0, v16
+; RV64-NEXT:    addi a1, sp, 128
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a3, a2, a1
-; RV64-NEXT:    vl8re32.v v16, (a3)
+; RV64-NEXT:    vl8re32.v v16, (a2)
 ; RV64-NEXT:    csrr a3, vlenb
 ; RV64-NEXT:    slli a3, a3, 3
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 128
 ; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a2, a2, a1
 ; RV64-NEXT:    add a3, a0, a1
-; RV64-NEXT:    vl8re32.v v16, (a3)
-; RV64-NEXT:    addi a3, sp, 128
-; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV64-NEXT:    vl8re32.v v24, (a2)
+; RV64-NEXT:    vl8re32.v v0, (a2)
+; RV64-NEXT:    vl8re32.v v24, (a3)
 ; RV64-NEXT:    vl8re32.v v16, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 5
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 128
-; RV64-NEXT:    vs8r.v v16, (a2)
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 128
+; RV64-NEXT:    vs8r.v v16, (a3)
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vs8r.v v0, (a0)
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    addi a2, sp, 128
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 5
 ; RV64-NEXT:    add a0, sp, a0
@@ -492,16 +515,15 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV64-NEXT:    slli a2, a2, 4
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 128
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    li a5, 42
-; RV64-NEXT:    addi a3, sp, 128
-; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV64-NEXT:    vs8r.v v8, (a1)
-; RV64-NEXT:    vmv8r.v v8, v24
+; RV64-NEXT:    vs8r.v v24, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 128
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv8r.v v16, v0
 ; RV64-NEXT:    call ext3
 ; RV64-NEXT:    addi sp, s0, -144
 ; RV64-NEXT:    .cfi_def_cfa sp, 144
@@ -559,29 +581,29 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    addi a1, s1, 128
-; RV32-NEXT:    vs8r.v v8, (a1)
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, s1, a2
-; RV32-NEXT:    addi a2, a2, 128
-; RV32-NEXT:    vs8r.v v8, (a2)
-; RV32-NEXT:    li a3, 8
-; RV32-NEXT:    sw a3, 0(sp)
-; RV32-NEXT:    add a1, a1, a0
-; RV32-NEXT:    vs8r.v v8, (a1)
-; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    addi t0, s1, 128
+; RV32-NEXT:    csrr t1, vlenb
+; RV32-NEXT:    slli t1, t1, 4
+; RV32-NEXT:    add t1, s1, t1
+; RV32-NEXT:    addi t1, t1, 128
+; RV32-NEXT:    li a7, 8
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    li a2, 2
 ; RV32-NEXT:    li a3, 3
 ; RV32-NEXT:    li a4, 4
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    li a6, 6
+; RV32-NEXT:    vs8r.v v8, (t0)
+; RV32-NEXT:    vs8r.v v8, (t1)
+; RV32-NEXT:    sw a7, 0(sp)
 ; RV32-NEXT:    li a7, 7
+; RV32-NEXT:    add t0, t0, a0
+; RV32-NEXT:    add a0, t1, a0
 ; RV32-NEXT:    csrr t3, vlenb
 ; RV32-NEXT:    slli t3, t3, 4
 ; RV32-NEXT:    add t3, s1, t3
 ; RV32-NEXT:    addi t3, t3, 128
+; RV32-NEXT:    vs8r.v v8, (t0)
 ; RV32-NEXT:    addi t5, s1, 128
 ; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    li a0, 0
@@ -622,29 +644,29 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    addi a1, s1, 128
-; RV64-NEXT:    vs8r.v v8, (a1)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 4
-; RV64-NEXT:    add a2, s1, a2
-; RV64-NEXT:    addi a2, a2, 128
-; RV64-NEXT:    vs8r.v v8, (a2)
-; RV64-NEXT:    li a3, 8
-; RV64-NEXT:    sd a3, 0(sp)
-; RV64-NEXT:    add a1, a1, a0
-; RV64-NEXT:    vs8r.v v8, (a1)
-; RV64-NEXT:    add a0, a2, a0
+; RV64-NEXT:    addi t0, s1, 128
+; RV64-NEXT:    csrr t1, vlenb
+; RV64-NEXT:    slli t1, t1, 4
+; RV64-NEXT:    add t1, s1, t1
+; RV64-NEXT:    addi t1, t1, 128
+; RV64-NEXT:    li a7, 8
 ; RV64-NEXT:    li a1, 1
 ; RV64-NEXT:    li a2, 2
 ; RV64-NEXT:    li a3, 3
 ; RV64-NEXT:    li a4, 4
 ; RV64-NEXT:    li a5, 5
 ; RV64-NEXT:    li a6, 6
+; RV64-NEXT:    vs8r.v v8, (t0)
+; RV64-NEXT:    vs8r.v v8, (t1)
+; RV64-NEXT:    sd a7, 0(sp)
 ; RV64-NEXT:    li a7, 7
+; RV64-NEXT:    add t0, t0, a0
+; RV64-NEXT:    add a0, t1, a0
 ; RV64-NEXT:    csrr t3, vlenb
 ; RV64-NEXT:    slli t3, t3, 4
 ; RV64-NEXT:    add t3, s1, t3
 ; RV64-NEXT:    addi t3, t3, 128
+; RV64-NEXT:    vs8r.v v8, (t0)
 ; RV64-NEXT:    addi t5, s1, 128
 ; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    li a0, 0
@@ -710,7 +732,6 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    addi a0, s1, 64
-; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    li a2, 2
 ; RV32-NEXT:    li a3, 3
@@ -722,6 +743,7 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV32-NEXT:    li t4, 9
 ; RV32-NEXT:    li t5, 10
 ; RV32-NEXT:    li t6, 11
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    sw a0, 0(sp)
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    vmv.v.i v16, 0
@@ -760,7 +782,6 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    addi a0, s1, 64
-; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    li a1, 1
 ; RV64-NEXT:    li a2, 2
 ; RV64-NEXT:    li a3, 3
@@ -772,6 +793,7 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV64-NEXT:    li t4, 9
 ; RV64-NEXT:    li t5, 10
 ; RV64-NEXT:    li t6, 11
+; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    sd a0, 0(sp)
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    vmv.v.i v16, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
index 277d8c9d55eaf2..9b27116fef7cae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
@@ -36,12 +36,12 @@ define <vscale x 32 x i32> @caller_scalable_vector_split_indirect(<vscale x 32 x
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    addi a0, sp, 128
-; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    addi a0, sp, 128
 ; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    vmv.v.i v16, 0
@@ -71,12 +71,12 @@ define <vscale x 32 x i32> @caller_scalable_vector_split_indirect(<vscale x 32 x
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    addi a0, sp, 128
-; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    vmv.v.i v16, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index ee2946a20b17e1..7d0b0118a72725 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vs
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -44,10 +44,10 @@ define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -69,10 +69,10 @@ define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vs
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -94,10 +94,10 @@ define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -120,10 +120,10 @@ define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -146,10 +146,10 @@ define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -172,10 +172,10 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -198,10 +198,10 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -224,10 +224,10 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16(<vscale x 16 x bfloat> %va,
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -250,10 +250,10 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16_unmasked(<vscale x 16 x bflo
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -279,62 +279,54 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v17, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 3
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vmv1r.v v8, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -354,51 +346,41 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 3
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
@@ -410,12 +392,6 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bflo
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.ceil.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -444,10 +420,10 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16(<vscale x 1 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -484,10 +460,10 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -526,10 +502,10 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16(<vscale x 2 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -566,10 +542,10 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -609,10 +585,10 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -650,10 +626,10 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -695,10 +671,10 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -736,10 +712,10 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -781,10 +757,10 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -822,10 +798,10 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -870,62 +846,54 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v8, v16
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -960,51 +928,41 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ;
 ; ZVFHMIN-LABEL: vp_ceil_vv_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -1016,12 +974,6 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.ceil.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1475,12 +1427,12 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI44_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a3)
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
@@ -1501,23 +1453,26 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -1533,12 +1488,12 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
 ; CHECK-LABEL: vp_ceil_vv_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI45_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a3)
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI45_0)
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
index ee4be0cf865a2d..400dfd393509c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
@@ -197,88 +197,82 @@ entry:
 define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data) {
 ; RV64-LABEL: test_compresstore_v256i8:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 4
-; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv1r.v v7, v8
 ; RV64-NEXT:    li a2, 128
-; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vle8.v v16, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v0, 1
-; RV64-NEXT:    vmv.x.s a1, v9
 ; RV64-NEXT:    vmv.x.s a3, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 16
-; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vcompress.vm v16, v24, v0
+; RV64-NEXT:    vle8.v v24, (a1)
+; RV64-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v8, v16, v0
 ; RV64-NEXT:    vcpop.m a4, v0
 ; RV64-NEXT:    vsetvli zero, a4, e8, m8, ta, ma
-; RV64-NEXT:    vse8.v v16, (a0)
-; RV64-NEXT:    addi a4, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vcompress.vm v16, v24, v8
-; RV64-NEXT:    vcpop.m a2, v8
+; RV64-NEXT:    vcpop.m a2, v7
 ; RV64-NEXT:    cpop a3, a3
 ; RV64-NEXT:    cpop a1, a1
 ; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    vcompress.vm v8, v24, v7
 ; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vse8.v v16, (a0)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_compresstore_v256i8:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    vmv1r.v v7, v8
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    sub sp, sp, a2
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    vmv8r.v v24, v16
 ; RV32-NEXT:    li a2, 128
-; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vle8.v v24, (a1)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v0, 1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v10, v9, a1
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    vsrl.vx v10, v0, a1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vmv.x.s a4, v9
-; RV32-NEXT:    vmv.x.s a5, v0
+; RV32-NEXT:    li a3, 32
+; RV32-NEXT:    vmv.x.s a4, v0
+; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT:    vle8.v v16, (a1)
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v10, v9, a3
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vsrl.vx v9, v0, a3
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v8, v16, v0
-; RV32-NEXT:    vcpop.m a6, v0
-; RV32-NEXT:    vsetvli zero, a6, e8, m8, ta, ma
-; RV32-NEXT:    vse8.v v8, (a0)
+; RV32-NEXT:    vcompress.vm v16, v24, v0
+; RV32-NEXT:    vcpop.m a3, v0
+; RV32-NEXT:    cpop a4, a4
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vmv.x.s a6, v9
+; RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV32-NEXT:    vse8.v v16, (a0)
 ; RV32-NEXT:    cpop a1, a1
+; RV32-NEXT:    cpop a3, a6
 ; RV32-NEXT:    cpop a5, a5
-; RV32-NEXT:    add a1, a5, a1
-; RV32-NEXT:    cpop a3, a3
-; RV32-NEXT:    cpop a4, a4
 ; RV32-NEXT:    add a3, a4, a3
-; RV32-NEXT:    add a1, a1, a3
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a1, a1, a5
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v8, v24, v7
-; RV32-NEXT:    vcpop.m a1, v7
+; RV32-NEXT:    vcompress.vm v16, v24, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    vcpop.m a1, v8
 ; RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; RV32-NEXT:    vse8.v v8, (a0)
+; RV32-NEXT:    vse8.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 entry:
   tail call void @llvm.masked.compressstore.v256i8(<256 x i8> %data, ptr align 1 %p, <256 x i1> %mask)
@@ -463,43 +457,45 @@ define void @test_compresstore_v128i16(ptr %p, <128 x i1> %mask, <128 x i16> %da
 ; RV64-NEXT:    vse16.v v24, (a0)
 ; RV64-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v0, 8
+; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.x.s a2, v0
 ; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; RV64-NEXT:    vcompress.vm v24, v16, v8
-; RV64-NEXT:    vcpop.m a2, v8
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    cpop a1, a1
-; RV64-NEXT:    slli a1, a1, 1
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; RV64-NEXT:    vcpop.m a1, v8
+; RV64-NEXT:    cpop a2, a2
+; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; RV64-NEXT:    vse16.v v24, (a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_compresstore_v128i16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a1, 64
-; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v24, v8, v0
-; RV32-NEXT:    vcpop.m a2, v0
-; RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; RV32-NEXT:    vse16.v v24, (a0)
 ; RV32-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 8
+; RV32-NEXT:    vslidedown.vi v7, v0, 8
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v24, v16, v7
+; RV32-NEXT:    vcpop.m a2, v7
+; RV32-NEXT:    li a3, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a4, v0
 ; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v8, v16, v24
-; RV32-NEXT:    vcpop.m a1, v24
-; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    vcompress.vm v16, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v0, a2
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    cpop a2, a2
-; RV32-NEXT:    vmv.x.s a3, v0
-; RV32-NEXT:    cpop a3, a3
-; RV32-NEXT:    add a2, a3, a2
-; RV32-NEXT:    slli a2, a2, 1
-; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vsrl.vx v8, v0, a3
 ; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; RV32-NEXT:    vse16.v v8, (a0)
+; RV32-NEXT:    vse16.v v16, (a0)
+; RV32-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    cpop a1, a1
+; RV32-NEXT:    cpop a3, a4
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; RV32-NEXT:    vse16.v v24, (a0)
 ; RV32-NEXT:    ret
 entry:
   tail call void @llvm.masked.compressstore.v128i16(<128 x i16> %data, ptr align 2 %p, <128 x i1> %mask)
@@ -659,10 +655,11 @@ define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data)
 ; RV64-NEXT:    vse32.v v24, (a0)
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v0, 4
+; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT:    vmv.x.s a2, v0
 ; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV64-NEXT:    vcompress.vm v24, v16, v8
 ; RV64-NEXT:    vcpop.m a1, v8
-; RV64-NEXT:    vmv.x.s a2, v0
 ; RV64-NEXT:    cpopw a2, a2
 ; RV64-NEXT:    slli a2, a2, 2
 ; RV64-NEXT:    add a0, a0, a2
@@ -680,10 +677,11 @@ define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data)
 ; RV32-NEXT:    vse32.v v24, (a0)
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v0, 4
+; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.x.s a2, v0
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vcompress.vm v24, v16, v8
 ; RV32-NEXT:    vcpop.m a1, v8
-; RV32-NEXT:    vmv.x.s a2, v0
 ; RV32-NEXT:    cpop a2, a2
 ; RV32-NEXT:    slli a2, a2, 2
 ; RV32-NEXT:    add a0, a0, a2
@@ -822,10 +820,10 @@ define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data)
 ; RV64-NEXT:    vse64.v v24, (a0)
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v0, 2
+; RV64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT:    vmv.x.s a1, v0
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vcompress.vm v24, v16, v8
-; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
 ; RV64-NEXT:    zext.h a1, a1
 ; RV64-NEXT:    cpopw a1, a1
 ; RV64-NEXT:    slli a1, a1, 3
@@ -844,10 +842,10 @@ define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data)
 ; RV32-NEXT:    vse64.v v24, (a0)
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v0, 2
+; RV32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32-NEXT:    vmv.x.s a1, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vcompress.vm v24, v16, v8
-; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
 ; RV32-NEXT:    zext.h a1, a1
 ; RV32-NEXT:    cpop a1, a1
 ; RV32-NEXT:    slli a1, a1, 3
diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
index ade1b4203148d8..ad176df71397e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
@@ -27,16 +27,18 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
 ; RV32-NEXT:    vmsne.vi v0, v11, 0
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v11, 10
 ; RV32-NEXT:    vmv1r.v v0, v10
-; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
-; RV32-NEXT:    vrgather.vi v9, v8, 0
-; RV32-NEXT:    vmsne.vi v0, v9, 0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV32-NEXT:    vrgather.vi v10, v9, 0
+; RV32-NEXT:    vmsne.vi v0, v10, 0
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v8, 10
-; RV32-NEXT:    vse32.v v8, (a0), v0.t
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vse32.v v11, (a0), v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: constant_folding_crash:
@@ -50,16 +52,18 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan
 ; RV64-NEXT:    vmsne.vi v0, v13, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
-; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmv1r.v v0, v12
-; RV64-NEXT:    vmerge.vim v8, v8, 1, v0
-; RV64-NEXT:    vrgather.vi v9, v8, 0
-; RV64-NEXT:    vmsne.vi v0, v9, 0
+; RV64-NEXT:    vmv.v.i v9, 0
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v8, 10
-; RV64-NEXT:    vse32.v v8, (a0), v0.t
+; RV64-NEXT:    vmv.v.i v10, 10
+; RV64-NEXT:    vmv1r.v v0, v12
+; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV64-NEXT:    vrgather.vi v11, v9, 0
+; RV64-NEXT:    vmsne.vi v0, v11, 0
+; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vse32.v v10, (a0), v0.t
 ; RV64-NEXT:    ret
 entry:
   %sunkaddr = getelementptr i8, ptr %v54, i64 8
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index d51f5eacd7d91a..208735b18cbab5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -13,6 +13,7 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -20,10 +21,9 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -37,11 +37,11 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -51,11 +51,11 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v9, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -76,6 +76,7 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -83,10 +84,9 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -100,11 +100,11 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -114,11 +114,11 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v9, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -139,6 +139,7 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -146,10 +147,9 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -163,11 +163,11 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
 ; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -177,11 +177,11 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v9, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
 ; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -202,6 +202,7 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -209,10 +210,9 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -226,11 +226,11 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
 ; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v10, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v10, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -240,11 +240,11 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
 ; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v10, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v10, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -265,6 +265,7 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
@@ -272,10 +273,9 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -289,11 +289,11 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
 ; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v12, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -303,11 +303,11 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v12, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v12
 ; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v12, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -328,6 +328,7 @@ define <vscale x 32 x i8> @ctlz_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -335,10 +336,9 @@ define <vscale x 32 x i8> @ctlz_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -363,6 +363,7 @@ define <vscale x 64 x i8> @ctlz_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -370,10 +371,9 @@ define <vscale x 64 x i8> @ctlz_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -398,7 +398,9 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -407,20 +409,18 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -431,8 +431,8 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -442,8 +442,8 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -464,7 +464,9 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -473,20 +475,18 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -497,8 +497,8 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -508,8 +508,8 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -530,7 +530,9 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -539,20 +541,18 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -563,8 +563,8 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -574,8 +574,8 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -596,7 +596,9 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
@@ -605,20 +607,18 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -629,8 +629,8 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -640,8 +640,8 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -662,7 +662,9 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
@@ -671,20 +673,18 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -695,8 +695,8 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -706,8 +706,8 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -728,7 +728,9 @@ define <vscale x 32 x i16> @ctlz_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
@@ -737,20 +739,18 @@ define <vscale x 32 x i16> @ctlz_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -772,7 +772,9 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -783,20 +785,18 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -809,8 +809,8 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -844,7 +844,9 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -855,20 +857,18 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -881,8 +881,8 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -916,7 +916,9 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
@@ -927,20 +929,18 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -953,8 +953,8 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -988,7 +988,9 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
@@ -999,20 +1001,18 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -1025,8 +1025,8 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -1060,7 +1060,9 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
@@ -1071,20 +1073,18 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v16, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -1097,8 +1097,8 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -1110,8 +1110,8 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-D-NEXT:    li a1, 158
+; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 32
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a1
@@ -1133,6 +1133,12 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vsrl.vi v9, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v9
@@ -1142,40 +1148,34 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vsrl.vi v9, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v9, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1185,6 +1185,23 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v9
@@ -1194,37 +1211,20 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v9
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v9, v8, a0
+; RV64I-NEXT:    vsrl.vx v9, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v9, v9, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vx v9, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1232,16 +1232,16 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv1i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-F-NEXT:    vmv.v.x v9, a0
-; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v10, 23
-; CHECK-F-NEXT:    vwsubu.vv v10, v9, v8
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    fsrmi a1, 1
+; CHECK-F-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v9, v8
+; CHECK-F-NEXT:    vmv.v.x v8, a0
+; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
+; CHECK-F-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v10, a1
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vminu.vx v8, v10, a0
+; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv1i64:
@@ -1273,6 +1273,12 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vsrl.vi v10, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v10
@@ -1282,40 +1288,34 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vsrl.vi v10, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v10, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1325,6 +1325,23 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v10
@@ -1334,37 +1351,20 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v10
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v10, v8, a0
+; RV64I-NEXT:    vsrl.vx v10, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v10, v10, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vx v10, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1372,16 +1372,16 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv2i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-F-NEXT:    vmv.v.x v10, a0
-; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfncvt.f.xu.w v11, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v11, 23
-; CHECK-F-NEXT:    vwsubu.vv v12, v10, v8
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    fsrmi a1, 1
+; CHECK-F-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
+; CHECK-F-NEXT:    vmv.v.x v8, a0
+; CHECK-F-NEXT:    vsrl.vi v9, v10, 23
+; CHECK-F-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v12, a1
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vminu.vx v8, v10, a0
+; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv2i64:
@@ -1413,6 +1413,12 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vsrl.vi v12, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v12
@@ -1422,40 +1428,34 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vsrl.vi v12, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v12
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v12, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1465,6 +1465,23 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v12
@@ -1474,37 +1491,20 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v12
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v12, v8, a0
+; RV64I-NEXT:    vsrl.vx v12, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v12, v12, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vx v12, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1512,16 +1512,16 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv4i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-F-NEXT:    vmv.v.x v12, a0
-; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfncvt.f.xu.w v14, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v14, 23
-; CHECK-F-NEXT:    vwsubu.vv v16, v12, v8
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    fsrmi a1, 1
+; CHECK-F-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8
+; CHECK-F-NEXT:    vmv.v.x v8, a0
+; CHECK-F-NEXT:    vsrl.vi v10, v12, 23
+; CHECK-F-NEXT:    vwsubu.vv v12, v8, v10
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v16, a1
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vminu.vx v8, v12, a0
+; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv4i64:
@@ -1553,6 +1553,12 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32I-NEXT:    vmv.v.x v24, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vsrl.vi v16, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v16
@@ -1562,41 +1568,35 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vsrl.vi v16, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v16
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v16, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vand.vv v24, v16, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v24, v8, v16
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v24, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
 ; RV32I-NEXT:    ret
@@ -1605,6 +1605,23 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v16
@@ -1614,37 +1631,20 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v16
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v16, v8, a0
+; RV64I-NEXT:    vsrl.vx v16, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v16, v16, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vx v16, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1652,16 +1652,16 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv8i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-F-NEXT:    vmv.v.x v16, a0
-; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfncvt.f.xu.w v20, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v20, 23
-; CHECK-F-NEXT:    vwsubu.vv v24, v16, v8
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    fsrmi a1, 1
+; CHECK-F-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8
+; CHECK-F-NEXT:    vmv.v.x v8, a0
+; CHECK-F-NEXT:    vsrl.vi v12, v16, 23
+; CHECK-F-NEXT:    vwsubu.vv v16, v8, v12
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v24, a1
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vminu.vx v8, v16, a0
+; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv8i64:
@@ -1693,6 +1693,7 @@ define <vscale x 1 x i8> @ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -1700,10 +1701,9 @@ define <vscale x 1 x i8> @ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1751,6 +1751,7 @@ define <vscale x 2 x i8> @ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -1758,10 +1759,9 @@ define <vscale x 2 x i8> @ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1809,6 +1809,7 @@ define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -1816,10 +1817,9 @@ define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1867,6 +1867,7 @@ define <vscale x 8 x i8> @ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -1874,10 +1875,9 @@ define <vscale x 8 x i8> @ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1925,6 +1925,7 @@ define <vscale x 16 x i8> @ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
@@ -1932,10 +1933,9 @@ define <vscale x 16 x i8> @ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1983,6 +1983,7 @@ define <vscale x 32 x i8> @ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -1990,10 +1991,9 @@ define <vscale x 32 x i8> @ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2017,6 +2017,7 @@ define <vscale x 64 x i8> @ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -2024,10 +2025,9 @@ define <vscale x 64 x i8> @ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2051,7 +2051,9 @@ define <vscale x 1 x i16> @ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2060,20 +2062,18 @@ define <vscale x 1 x i16> @ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2112,7 +2112,9 @@ define <vscale x 2 x i16> @ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2121,20 +2123,18 @@ define <vscale x 2 x i16> @ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2173,7 +2173,9 @@ define <vscale x 4 x i16> @ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2182,20 +2184,18 @@ define <vscale x 4 x i16> @ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2234,7 +2234,9 @@ define <vscale x 8 x i16> @ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
@@ -2243,20 +2245,18 @@ define <vscale x 8 x i16> @ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2295,7 +2295,9 @@ define <vscale x 16 x i16> @ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
@@ -2304,20 +2306,18 @@ define <vscale x 16 x i16> @ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2356,7 +2356,9 @@ define <vscale x 32 x i16> @ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
@@ -2365,20 +2367,18 @@ define <vscale x 32 x i16> @ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2399,7 +2399,9 @@ define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2410,20 +2412,18 @@ define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2466,7 +2466,9 @@ define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2477,20 +2479,18 @@ define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2533,7 +2533,9 @@ define <vscale x 4 x i32> @ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
@@ -2544,20 +2546,18 @@ define <vscale x 4 x i32> @ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2600,7 +2600,9 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
@@ -2611,20 +2613,18 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2667,7 +2667,9 @@ define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
@@ -2678,20 +2680,18 @@ define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v16, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2735,6 +2735,12 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vsrl.vi v9, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v9
@@ -2744,40 +2750,34 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vsrl.vi v9, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v9, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2787,6 +2787,23 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v9
@@ -2796,37 +2813,20 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v9
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v9, v8, a0
+; RV64I-NEXT:    vsrl.vx v9, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v9, v9, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vx v9, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2869,6 +2869,12 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vsrl.vi v10, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v10
@@ -2878,40 +2884,34 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vsrl.vi v10, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v10, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2921,6 +2921,23 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v10
@@ -2930,37 +2947,20 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v10
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v10, v8, a0
+; RV64I-NEXT:    vsrl.vx v10, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v10, v10, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vx v10, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3003,6 +3003,12 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vsrl.vi v12, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v12
@@ -3012,40 +3018,34 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vsrl.vi v12, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v12
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v12, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -3055,6 +3055,23 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v12
@@ -3064,37 +3081,20 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v12
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v12, v8, a0
+; RV64I-NEXT:    vsrl.vx v12, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v12, v12, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vx v12, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3137,6 +3137,12 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32I-NEXT:    vmv.v.x v24, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vsrl.vi v16, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v16
@@ -3146,41 +3152,35 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vsrl.vi v16, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v16
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v16, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vand.vv v24, v16, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v24, v8, v16
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v24, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
 ; RV32I-NEXT:    ret
@@ -3189,6 +3189,23 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v16
@@ -3198,37 +3215,20 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v16
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v16, v8, a0
+; RV64I-NEXT:    vsrl.vx v16, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v16, v16, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vx v16, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 2c9f633b890143..f56a792fdef6a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -15,6 +15,7 @@ define <vscale x 1 x i8> @vp_ctlz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
@@ -22,7 +23,6 @@ define <vscale x 1 x i8> @vp_ctlz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -42,11 +42,11 @@ define <vscale x 1 x i8> @vp_ctlz_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -68,6 +68,7 @@ define <vscale x 2 x i8> @vp_ctlz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
@@ -75,7 +76,6 @@ define <vscale x 2 x i8> @vp_ctlz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -95,11 +95,11 @@ define <vscale x 2 x i8> @vp_ctlz_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -121,6 +121,7 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
@@ -128,7 +129,6 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -148,11 +148,11 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -174,6 +174,7 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
@@ -181,7 +182,6 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -201,11 +201,11 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -227,6 +227,7 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
@@ -234,7 +235,6 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -254,11 +254,11 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -280,6 +280,7 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -287,10 +288,9 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -314,6 +314,7 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -321,10 +322,9 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -350,6 +350,7 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -357,10 +358,9 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -384,6 +384,7 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -391,10 +392,9 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -420,11 +420,11 @@ define <vscale x 1 x i16> @vp_ctlz_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -444,8 +444,8 @@ define <vscale x 1 x i16> @vp_ctlz_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -467,11 +467,11 @@ define <vscale x 2 x i16> @vp_ctlz_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -491,8 +491,8 @@ define <vscale x 2 x i16> @vp_ctlz_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -514,11 +514,11 @@ define <vscale x 4 x i16> @vp_ctlz_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -538,8 +538,8 @@ define <vscale x 4 x i16> @vp_ctlz_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -561,11 +561,11 @@ define <vscale x 8 x i16> @vp_ctlz_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -585,8 +585,8 @@ define <vscale x 8 x i16> @vp_ctlz_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -608,11 +608,11 @@ define <vscale x 16 x i16> @vp_ctlz_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -632,8 +632,8 @@ define <vscale x 16 x i16> @vp_ctlz_nxv16i16_unmasked(<vscale x 16 x i16> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -655,7 +655,9 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
@@ -664,20 +666,18 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -698,7 +698,9 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
@@ -707,20 +709,18 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -746,9 +746,9 @@ define <vscale x 1 x i32> @vp_ctlz_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -795,9 +795,9 @@ define <vscale x 2 x i32> @vp_ctlz_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -844,9 +844,9 @@ define <vscale x 4 x i32> @vp_ctlz_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -893,9 +893,9 @@ define <vscale x 8 x i32> @vp_ctlz_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -940,8 +940,8 @@ define <vscale x 16 x i32> @vp_ctlz_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -963,8 +963,8 @@ define <vscale x 16 x i32> @vp_ctlz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -1237,20 +1237,20 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    fsrmi a4, 1
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    sub a5, a0, a1
+; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    sltu a3, a0, a5
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    and a5, a3, a5
+; CHECK-NEXT:    li a3, 1086
+; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
-; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    li a3, 1086
 ; CHECK-NEXT:    vrsub.vx v16, v16, a3, v0.t
 ; CHECK-NEXT:    li a4, 64
 ; CHECK-NEXT:    vminu.vx v16, v16, a4, v0.t
@@ -1273,12 +1273,12 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB46_2
@@ -1297,17 +1297,17 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-LABEL: vp_ctlz_nxv16i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-NEXT:    fsrm a3
+; CHECK-NEXT:    fsrmi a4, 1
 ; CHECK-NEXT:    li a2, 52
-; CHECK-NEXT:    vsrl.vx v16, v16, a2
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a5, a0, a3
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a5, a5, a3
 ; CHECK-NEXT:    li a3, 1086
+; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    fsrm a4
+; CHECK-NEXT:    vsrl.vx v16, v16, a2
 ; CHECK-NEXT:    vrsub.vx v16, v16, a3
 ; CHECK-NEXT:    li a4, 64
 ; CHECK-NEXT:    vminu.vx v16, v16, a4
@@ -1349,6 +1349,7 @@ define <vscale x 1 x i8> @vp_ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
@@ -1356,7 +1357,6 @@ define <vscale x 1 x i8> @vp_ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1397,6 +1397,7 @@ define <vscale x 2 x i8> @vp_ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
@@ -1404,7 +1405,6 @@ define <vscale x 2 x i8> @vp_ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1445,6 +1445,7 @@ define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
@@ -1452,7 +1453,6 @@ define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1493,6 +1493,7 @@ define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
@@ -1500,7 +1501,6 @@ define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1541,6 +1541,7 @@ define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
@@ -1548,7 +1549,6 @@ define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1589,6 +1589,7 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -1596,10 +1597,9 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <v
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -1623,6 +1623,7 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -1630,10 +1631,9 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -1658,6 +1658,7 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1665,10 +1666,9 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <v
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -1692,6 +1692,7 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -1699,10 +1700,9 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -1727,11 +1727,11 @@ define <vscale x 1 x i16> @vp_ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1769,11 +1769,11 @@ define <vscale x 2 x i16> @vp_ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1811,11 +1811,11 @@ define <vscale x 4 x i16> @vp_ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1853,11 +1853,11 @@ define <vscale x 8 x i16> @vp_ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1895,11 +1895,11 @@ define <vscale x 16 x i16> @vp_ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1937,7 +1937,9 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
@@ -1946,20 +1948,18 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va,
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -1980,7 +1980,9 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
@@ -1989,20 +1991,18 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2027,9 +2027,9 @@ define <vscale x 1 x i32> @vp_ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va, <v
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2071,9 +2071,9 @@ define <vscale x 2 x i32> @vp_ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va, <v
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2115,9 +2115,9 @@ define <vscale x 4 x i32> @vp_ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va, <v
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2159,9 +2159,9 @@ define <vscale x 8 x i32> @vp_ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va, <v
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2201,8 +2201,8 @@ define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va,
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
@@ -2467,18 +2467,18 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a4, a0, a1
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    sltu a2, a0, a4
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a4, a2, a4
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
 ; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
 ; CHECK-NEXT:    li a3, 1086
 ; CHECK-NEXT:    vrsub.vx v16, v16, a3, v0.t
@@ -2500,12 +2500,12 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB94_2
@@ -2524,15 +2524,15 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a4, a0, a2
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a4, a4, a2
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2
 ; CHECK-NEXT:    li a3, 1086
 ; CHECK-NEXT:    vrsub.vx v16, v16, a3
@@ -2576,12 +2576,12 @@ define <vscale x 1 x i9> @vp_ctlz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 ; CHECK-NEXT:    li a1, 511
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -2606,12 +2606,12 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2629,13 +2629,13 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2686,13 +2686,13 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2743,14 +2743,14 @@ define <vscale x 1 x i9> @vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor(<vsc
 ; CHECK-NEXT:    li a1, 511
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vxor.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    vnsrl.wi v8, v9, 23
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 7
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
index c310274d685081..fa8e332e5076de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
@@ -11,8 +11,8 @@ define <vscale x 1 x i8> @ctpop_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -39,8 +39,8 @@ define <vscale x 2 x i8> @ctpop_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -67,8 +67,8 @@ define <vscale x 4 x i8> @ctpop_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -95,8 +95,8 @@ define <vscale x 8 x i8> @ctpop_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -123,8 +123,8 @@ define <vscale x 16 x i8> @ctpop_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -151,8 +151,8 @@ define <vscale x 32 x i8> @ctpop_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -179,8 +179,8 @@ define <vscale x 64 x i8> @ctpop_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -208,17 +208,17 @@ define <vscale x 1 x i16> @ctpop_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -243,17 +243,17 @@ define <vscale x 2 x i16> @ctpop_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -278,17 +278,17 @@ define <vscale x 4 x i16> @ctpop_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -313,17 +313,17 @@ define <vscale x 8 x i16> @ctpop_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -348,17 +348,17 @@ define <vscale x 16 x i16> @ctpop_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -383,17 +383,17 @@ define <vscale x 32 x i16> @ctpop_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -418,17 +418,17 @@ define <vscale x 1 x i32> @ctpop_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -454,17 +454,17 @@ define <vscale x 2 x i32> @ctpop_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -490,17 +490,17 @@ define <vscale x 4 x i32> @ctpop_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -526,17 +526,17 @@ define <vscale x 8 x i32> @ctpop_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -562,17 +562,17 @@ define <vscale x 16 x i32> @ctpop_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -681,31 +681,31 @@ define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a0, 209715
 ; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    lui a0, 4112
 ; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -713,34 +713,34 @@ define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
 ;
 ; RV64-LABEL: ctpop_nxv1i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1
 ; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
 ; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV64-NEXT:    vsrl.vi v9, v8, 1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -764,31 +764,31 @@ define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a0, 209715
 ; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    lui a0, 4112
 ; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -796,34 +796,34 @@ define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
 ;
 ; RV64-LABEL: ctpop_nxv2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1
 ; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
 ; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vi v10, v8, 1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -847,31 +847,31 @@ define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a0, 209715
 ; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vmv.v.x v16, a0
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmv.v.x v16, a0
 ; RV32-NEXT:    lui a0, 4112
 ; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -879,34 +879,34 @@ define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
 ;
 ; RV64-LABEL: ctpop_nxv4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1
 ; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
 ; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV64-NEXT:    vsrl.vi v12, v8, 1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -930,66 +930,66 @@ define <vscale x 8 x i64> @ctpop_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a0, 209715
 ; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a0, 4112
 ; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    vmv.v.x v24, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ctpop_nxv8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
 ; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
 ; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v16, v8, 1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 093eb0ead313ef..9e75dc9dccffde 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -17,8 +17,8 @@ define <vscale x 1 x i8> @vp_ctpop_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -44,8 +44,8 @@ define <vscale x 1 x i8> @vp_ctpop_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 ze
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -73,8 +73,8 @@ define <vscale x 2 x i8> @vp_ctpop_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -100,8 +100,8 @@ define <vscale x 2 x i8> @vp_ctpop_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 ze
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -129,8 +129,8 @@ define <vscale x 4 x i8> @vp_ctpop_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -156,8 +156,8 @@ define <vscale x 4 x i8> @vp_ctpop_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 ze
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -185,8 +185,8 @@ define <vscale x 8 x i8> @vp_ctpop_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -212,8 +212,8 @@ define <vscale x 8 x i8> @vp_ctpop_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 ze
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -241,8 +241,8 @@ define <vscale x 16 x i8> @vp_ctpop_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -268,8 +268,8 @@ define <vscale x 16 x i8> @vp_ctpop_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -297,8 +297,8 @@ define <vscale x 32 x i8> @vp_ctpop_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -324,8 +324,8 @@ define <vscale x 32 x i8> @vp_ctpop_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -353,8 +353,8 @@ define <vscale x 64 x i8> @vp_ctpop_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -380,8 +380,8 @@ define <vscale x 64 x i8> @vp_ctpop_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -410,17 +410,17 @@ define <vscale x 1 x i16> @vp_ctpop_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -444,17 +444,17 @@ define <vscale x 1 x i16> @vp_ctpop_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -480,17 +480,17 @@ define <vscale x 2 x i16> @vp_ctpop_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -514,17 +514,17 @@ define <vscale x 2 x i16> @vp_ctpop_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -550,17 +550,17 @@ define <vscale x 4 x i16> @vp_ctpop_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -584,17 +584,17 @@ define <vscale x 4 x i16> @vp_ctpop_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -620,17 +620,17 @@ define <vscale x 8 x i16> @vp_ctpop_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -654,17 +654,17 @@ define <vscale x 8 x i16> @vp_ctpop_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -690,17 +690,17 @@ define <vscale x 16 x i16> @vp_ctpop_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -724,17 +724,17 @@ define <vscale x 16 x i16> @vp_ctpop_nxv16i16_unmasked(<vscale x 16 x i16> %va,
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -760,17 +760,17 @@ define <vscale x 32 x i16> @vp_ctpop_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -794,17 +794,17 @@ define <vscale x 32 x i16> @vp_ctpop_nxv32i16_unmasked(<vscale x 32 x i16> %va,
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -830,17 +830,17 @@ define <vscale x 1 x i32> @vp_ctpop_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -865,17 +865,17 @@ define <vscale x 1 x i32> @vp_ctpop_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -902,17 +902,17 @@ define <vscale x 2 x i32> @vp_ctpop_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -937,17 +937,17 @@ define <vscale x 2 x i32> @vp_ctpop_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -974,17 +974,17 @@ define <vscale x 4 x i32> @vp_ctpop_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1009,17 +1009,17 @@ define <vscale x 4 x i32> @vp_ctpop_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1046,17 +1046,17 @@ define <vscale x 8 x i32> @vp_ctpop_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1081,17 +1081,17 @@ define <vscale x 8 x i32> @vp_ctpop_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1118,17 +1118,17 @@ define <vscale x 16 x i32> @vp_ctpop_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1153,17 +1153,17 @@ define <vscale x 16 x i32> @vp_ctpop_nxv16i32_unmasked(<vscale x 16 x i32> %va,
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1191,67 +1191,67 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv1i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1273,31 +1273,31 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1305,34 +1305,34 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1357,67 +1357,67 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv2i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1439,31 +1439,31 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1471,34 +1471,34 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1523,67 +1523,67 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v16, v12, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv4i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1605,31 +1605,31 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1637,34 +1637,34 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1689,67 +1689,67 @@ define <vscale x 7 x i64> @vp_ctpop_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv7i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1771,66 +1771,66 @@ define <vscale x 7 x i64> @vp_ctpop_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1855,67 +1855,67 @@ define <vscale x 8 x i64> @vp_ctpop_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv8i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1937,66 +1937,66 @@ define <vscale x 8 x i64> @vp_ctpop_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -2018,54 +2018,53 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    vmv1r.v v7, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a2, a1, 3
-; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    srli a3, a1, 3
+; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a3
+; RV32-NEXT:    sub a3, a0, a1
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    sltu a2, a0, a3
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    lui a3, 209715
@@ -2073,13 +2072,13 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
@@ -2088,7 +2087,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
@@ -2102,8 +2101,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -2114,7 +2112,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -2122,75 +2120,61 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB46_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB46_2:
 ; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv8r.v v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -2198,26 +2182,22 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
@@ -2250,39 +2230,39 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a4, a4, -241
 ; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
+; RV64-NEXT:    slli a6, a2, 32
+; RV64-NEXT:    add a6, a2, a6
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    add a7, a3, a2
+; RV64-NEXT:    slli a2, a4, 32
+; RV64-NEXT:    add a2, a4, a2
+; RV64-NEXT:    slli a3, a5, 32
+; RV64-NEXT:    add a3, a5, a3
+; RV64-NEXT:    li a4, 56
 ; RV64-NEXT:    sub a1, a0, a1
 ; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a7, v0.t
+; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a7, v0.t
+; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a4, v0.t
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a1, a1, 3
@@ -2291,17 +2271,17 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v16, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2317,12 +2297,12 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB46_2
@@ -2347,27 +2327,26 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    sub a4, a0, a1
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a2
+; RV32-NEXT:    sltu a2, a0, a4
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a2, a2, a4
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v24, v16, 1
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v0, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vand.vv v24, v24, v0
 ; RV32-NEXT:    vsub.vv v16, v16, v24
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2383,8 +2362,10 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT:    vsrl.vi v24, v16, 4
 ; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    lui a4, 4112
 ; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    addi a4, a4, 257
+; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
@@ -2393,10 +2374,8 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a4
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2448,59 +2427,69 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ;
 ; RV64-LABEL: vp_ctpop_nxv16i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    mv a2, a0
-; RV64-NEXT:    bltu a0, a1, .LBB47_2
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB47_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB47_2:
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a7, a3, 32
+; RV64-NEXT:    add a3, a3, a7
+; RV64-NEXT:    slli a7, a4, 32
+; RV64-NEXT:    add a4, a4, a7
+; RV64-NEXT:    slli a7, a5, 32
+; RV64-NEXT:    add a5, a5, a7
+; RV64-NEXT:    slli a7, a6, 32
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    li a7, 56
+; RV64-NEXT:    sub a2, a0, a2
+; RV64-NEXT:    sltu a0, a0, a2
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a4
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vand.vx v16, v16, a4
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a6
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a7
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v16, v16, a6
+; RV64-NEXT:    vsrl.vx v16, v16, a7
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked:
@@ -2532,21 +2521,21 @@ define <vscale x 1 x i9> @vp_ctpop_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i
 ; CHECK-NEXT:    li a1, 511
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index 3bddcf798f66bf..5761ae0926eae4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -13,14 +13,14 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -34,6 +34,7 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v9
@@ -41,7 +42,6 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -51,6 +51,7 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v9
@@ -58,7 +59,6 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -79,14 +79,14 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -100,6 +100,7 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v9
@@ -107,7 +108,6 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -117,6 +117,7 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v9
@@ -124,7 +125,6 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -145,14 +145,14 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -166,6 +166,7 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v9
@@ -173,7 +174,6 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v9, v12, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -183,6 +183,7 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v9
@@ -190,7 +191,6 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v9, v12, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -211,14 +211,14 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -232,6 +232,7 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v9
@@ -239,7 +240,6 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v10, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -249,6 +249,7 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v9
@@ -256,7 +257,6 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v10, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -277,14 +277,14 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -298,6 +298,7 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v12, v10
@@ -305,7 +306,6 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v10, v12, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -315,6 +315,7 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v10, v8, v10
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v12, v10
@@ -322,7 +323,6 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v10, v12, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -343,14 +343,14 @@ define <vscale x 32 x i8> @cttz_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -375,14 +375,14 @@ define <vscale x 64 x i8> @cttz_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -407,24 +407,24 @@ define <vscale x 1 x i16> @cttz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -435,28 +435,28 @@ define <vscale x 1 x i16> @cttz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv1i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v9, v8, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
+; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-D-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv1i16:
@@ -474,24 +474,24 @@ define <vscale x 2 x i16> @cttz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -502,28 +502,28 @@ define <vscale x 2 x i16> @cttz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv2i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v9, v8, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
+; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-D-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv2i16:
@@ -541,24 +541,24 @@ define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -569,28 +569,28 @@ define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv4i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v9, v8, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v9
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv4i16:
@@ -608,24 +608,24 @@ define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -636,28 +636,28 @@ define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v10, v8, v10
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-F-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v10, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv8i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-D-NEXT:    vand.vv v10, v8, v10
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-D-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v10
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v10, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv8i16:
@@ -675,24 +675,24 @@ define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v12, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v12, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -703,28 +703,28 @@ define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v12, v8, v12
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-F-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v12, v12, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v12
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv16i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-D-NEXT:    vand.vv v12, v8, v12
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-D-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v12, v12, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v12
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv16i16:
@@ -742,24 +742,24 @@ define <vscale x 32 x i16> @cttz_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -781,24 +781,24 @@ define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -810,15 +810,15 @@ define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v9
-; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a1
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -826,15 +826,15 @@ define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v9, v10, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    li a0, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv1i32:
@@ -852,24 +852,24 @@ define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -881,15 +881,15 @@ define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v9
-; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a1
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -897,15 +897,15 @@ define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v9, v10, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    li a0, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv2i32:
@@ -923,24 +923,24 @@ define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -952,15 +952,15 @@ define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v10, v10
-; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v10, v10, a1
+; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v10
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -968,15 +968,15 @@ define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v10, v8, v10
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v10, v12, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-D-NEXT:    li a0, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v10, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv4i32:
@@ -994,24 +994,24 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v12, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v12, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -1023,15 +1023,15 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v12, v8, v12
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v12, v12
-; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v12, v12, a1
+; CHECK-F-NEXT:    vand.vv v12, v8, v12
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v12
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1039,15 +1039,15 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v12, v8, v12
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v12, v16, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v12, v12, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-D-NEXT:    li a0, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv8i32:
@@ -1065,24 +1065,24 @@ define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v16, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v16, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v16, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v16, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -1094,15 +1094,15 @@ define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-F-NEXT:    vand.vv v16, v8, v16
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v16, v16, a1
+; CHECK-F-NEXT:    vand.vv v16, v8, v16
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v16
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1110,15 +1110,15 @@ define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-D-NEXT:    vand.vv v16, v8, v16
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-D-NEXT:    vsrl.vi v16, v16, 23
 ; CHECK-D-NEXT:    li a1, 127
-; CHECK-D-NEXT:    vsub.vx v16, v16, a1
+; CHECK-D-NEXT:    vand.vv v16, v8, v16
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v16
+; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1137,39 +1137,39 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vsub.vx v9, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vnot.v v9, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vand.vv v9, v9, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1178,37 +1178,37 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-LABEL: cttz_nxv1i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
 ; RV64I-NEXT:    vsub.vx v9, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v9, v9, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vx v9, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1217,17 +1217,17 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v9
-; CHECK-F-NEXT:    vsrl.vi v9, v10, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v10, v9, a1
-; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v9
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v9, v8, a1
 ; CHECK-F-NEXT:    li a1, 64
-; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1235,16 +1235,16 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v9, v9
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v9, v8, v9
+; CHECK-D-NEXT:    vfcvt.f.xu.v v9, v9
 ; CHECK-D-NEXT:    vsrl.vx v9, v9, a1
 ; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v9, v9, a1
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v9, a1
 ; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1263,39 +1263,39 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vsub.vx v10, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vnot.v v10, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v10, v10, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1304,37 +1304,37 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-LABEL: cttz_nxv2i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
 ; RV64I-NEXT:    vsub.vx v10, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v10, v10, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vx v10, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1343,17 +1343,17 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v10
-; CHECK-F-NEXT:    vsrl.vi v10, v12, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v12, v10, a1
-; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v10
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v10, v8, a1
 ; CHECK-F-NEXT:    li a1, 64
-; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1361,16 +1361,16 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-D-NEXT:    vand.vv v10, v8, v10
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v10, v10
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v10, v8, v10
+; CHECK-D-NEXT:    vfcvt.f.xu.v v10, v10
 ; CHECK-D-NEXT:    vsrl.vx v10, v10, a1
 ; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v10, v10, a1
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v10, a1
 ; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1389,39 +1389,39 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vsub.vx v12, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vnot.v v12, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v12, v12, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1430,37 +1430,37 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-LABEL: cttz_nxv4i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
 ; RV64I-NEXT:    vsub.vx v12, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v12, v12, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vx v12, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1469,17 +1469,17 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v12, v8, v12
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v12
-; CHECK-F-NEXT:    vsrl.vi v12, v16, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v16, v12, a1
-; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT:    vand.vv v12, v8, v12
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v12
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v12, v8, a1
 ; CHECK-F-NEXT:    li a1, 64
-; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1487,16 +1487,16 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-D-NEXT:    vand.vv v12, v8, v12
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v12, v12
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v12, v8, v12
+; CHECK-D-NEXT:    vfcvt.f.xu.v v12, v12
 ; CHECK-D-NEXT:    vsrl.vx v12, v12, a1
 ; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v12, v12, a1
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v12, a1
 ; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1515,40 +1515,40 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vsub.vx v16, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vnot.v v16, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vand.vv v8, v16, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vand.vv v24, v16, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v24, v8, v16
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v24, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
 ; RV32I-NEXT:    ret
@@ -1556,37 +1556,37 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-LABEL: cttz_nxv8i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v16, v16, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vx v16, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1595,17 +1595,17 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-F-NEXT:    vand.vv v16, v8, v16
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v24, v16
-; CHECK-F-NEXT:    vsrl.vi v16, v24, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v24, v16, a1
-; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT:    vand.vv v16, v8, v16
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v16
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v16, v8, a1
 ; CHECK-F-NEXT:    li a1, 64
-; CHECK-F-NEXT:    vmerge.vxm v8, v24, a1, v0
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1613,16 +1613,16 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-D-NEXT:    vand.vv v16, v8, v16
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v16, v8, v16
+; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-D-NEXT:    vsrl.vx v16, v16, a1
 ; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v16, v16, a1
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v16, a1
 ; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1641,14 +1641,14 @@ define <vscale x 1 x i8> @cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1702,14 +1702,14 @@ define <vscale x 2 x i8> @cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1763,14 +1763,14 @@ define <vscale x 4 x i8> @cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1824,14 +1824,14 @@ define <vscale x 8 x i8> @cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1885,14 +1885,14 @@ define <vscale x 16 x i8> @cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1946,14 +1946,14 @@ define <vscale x 32 x i8> @cttz_zero_undef_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -1977,14 +1977,14 @@ define <vscale x 64 x i8> @cttz_zero_undef_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2008,24 +2008,24 @@ define <vscale x 1 x i16> @cttz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2068,24 +2068,24 @@ define <vscale x 2 x i16> @cttz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2128,24 +2128,24 @@ define <vscale x 4 x i16> @cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2188,24 +2188,24 @@ define <vscale x 8 x i16> @cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2248,24 +2248,24 @@ define <vscale x 16 x i16> @cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v12, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v12, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2308,24 +2308,24 @@ define <vscale x 32 x i16> @cttz_zero_undef_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2346,24 +2346,24 @@ define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2375,8 +2375,8 @@ define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2388,9 +2388,9 @@ define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v9
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v8, v9, a0
 ; CHECK-D-NEXT:    li a0, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a0
@@ -2410,24 +2410,24 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2439,8 +2439,8 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2452,9 +2452,9 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v9
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v8, v10, a0
 ; CHECK-D-NEXT:    li a0, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a0
@@ -2474,24 +2474,24 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2503,8 +2503,8 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2516,9 +2516,9 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v10
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v8, v12, a0
 ; CHECK-D-NEXT:    li a0, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a0
@@ -2538,24 +2538,24 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v12, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v12, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2567,8 +2567,8 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2580,9 +2580,9 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v12
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v8, v16, a0
 ; CHECK-D-NEXT:    li a0, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a0
@@ -2602,24 +2602,24 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v16, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v16, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v16, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v16, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2631,8 +2631,8 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2644,8 +2644,8 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v16
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vand.vv v8, v8, v16
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-D-NEXT:    li a1, 127
@@ -2667,39 +2667,39 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vsub.vx v9, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vnot.v v9, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vand.vv v9, v9, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2708,37 +2708,37 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-LABEL: cttz_zero_undef_nxv1i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
 ; RV64I-NEXT:    vsub.vx v9, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v9, v9, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vx v9, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2747,8 +2747,8 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v9, v8
 ; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
@@ -2761,10 +2761,10 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v9
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v8, v8, v9
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a1
@@ -2785,39 +2785,39 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vsub.vx v10, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vnot.v v10, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v10, v10, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2826,37 +2826,37 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-LABEL: cttz_zero_undef_nxv2i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
 ; RV64I-NEXT:    vsub.vx v10, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v10, v10, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vx v10, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2865,8 +2865,8 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
 ; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
@@ -2879,10 +2879,10 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v10
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v8, v8, v10
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a1
@@ -2903,39 +2903,39 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vsub.vx v12, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vnot.v v12, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v12, v12, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2944,37 +2944,37 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-LABEL: cttz_zero_undef_nxv4i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
 ; RV64I-NEXT:    vsub.vx v12, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v12, v12, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vx v12, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2983,8 +2983,8 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8
 ; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
@@ -2997,10 +2997,10 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v12
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v8, v8, v12
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a1
@@ -3021,40 +3021,40 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vsub.vx v16, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vnot.v v16, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vand.vv v8, v16, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vand.vv v24, v16, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v24, v8, v16
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v24, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
 ; RV32I-NEXT:    ret
@@ -3062,37 +3062,37 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-LABEL: cttz_zero_undef_nxv8i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v16, v16, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vx v16, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3101,8 +3101,8 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8
 ; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
@@ -3115,10 +3115,10 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v16
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v8, v8, v16
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index 619c05dd8ab740..9e6295b6644171 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -16,13 +16,13 @@ define <vscale x 1 x i8> @vp_cttz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -46,14 +46,14 @@ define <vscale x 1 x i8> @vp_cttz_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -80,13 +80,13 @@ define <vscale x 2 x i8> @vp_cttz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -110,14 +110,14 @@ define <vscale x 2 x i8> @vp_cttz_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -144,13 +144,13 @@ define <vscale x 4 x i8> @vp_cttz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -174,14 +174,14 @@ define <vscale x 4 x i8> @vp_cttz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -208,13 +208,13 @@ define <vscale x 8 x i8> @vp_cttz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -238,14 +238,14 @@ define <vscale x 8 x i8> @vp_cttz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -272,13 +272,13 @@ define <vscale x 16 x i8> @vp_cttz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -302,14 +302,14 @@ define <vscale x 16 x i8> @vp_cttz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -336,13 +336,13 @@ define <vscale x 32 x i8> @vp_cttz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -366,14 +366,14 @@ define <vscale x 32 x i8> @vp_cttz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -400,13 +400,13 @@ define <vscale x 64 x i8> @vp_cttz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -430,14 +430,14 @@ define <vscale x 64 x i8> @vp_cttz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -464,23 +464,23 @@ define <vscale x 1 x i16> @vp_cttz_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -501,24 +501,24 @@ define <vscale x 1 x i16> @vp_cttz_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -542,23 +542,23 @@ define <vscale x 2 x i16> @vp_cttz_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -579,24 +579,24 @@ define <vscale x 2 x i16> @vp_cttz_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -620,23 +620,23 @@ define <vscale x 4 x i16> @vp_cttz_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -657,24 +657,24 @@ define <vscale x 4 x i16> @vp_cttz_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -698,23 +698,23 @@ define <vscale x 8 x i16> @vp_cttz_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -735,24 +735,24 @@ define <vscale x 8 x i16> @vp_cttz_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -776,23 +776,23 @@ define <vscale x 16 x i16> @vp_cttz_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -813,24 +813,24 @@ define <vscale x 16 x i16> @vp_cttz_nxv16i16_unmasked(<vscale x 16 x i16> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -854,23 +854,23 @@ define <vscale x 32 x i16> @vp_cttz_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -891,24 +891,24 @@ define <vscale x 32 x i16> @vp_cttz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -932,23 +932,23 @@ define <vscale x 1 x i32> @vp_cttz_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -970,24 +970,24 @@ define <vscale x 1 x i32> @vp_cttz_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1012,23 +1012,23 @@ define <vscale x 2 x i32> @vp_cttz_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1050,24 +1050,24 @@ define <vscale x 2 x i32> @vp_cttz_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1092,23 +1092,23 @@ define <vscale x 4 x i32> @vp_cttz_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1130,24 +1130,24 @@ define <vscale x 4 x i32> @vp_cttz_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1172,23 +1172,23 @@ define <vscale x 8 x i32> @vp_cttz_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1210,24 +1210,24 @@ define <vscale x 8 x i32> @vp_cttz_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1252,23 +1252,23 @@ define <vscale x 16 x i32> @vp_cttz_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1290,24 +1290,24 @@ define <vscale x 16 x i32> @vp_cttz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1332,78 +1332,78 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv1i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1421,39 +1421,39 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vnot.v v9, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1462,37 +1462,37 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1514,78 +1514,78 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1603,39 +1603,39 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vnot.v v10, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1644,37 +1644,37 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1696,78 +1696,78 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1785,39 +1785,39 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vnot.v v12, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1826,37 +1826,37 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1878,78 +1878,78 @@ define <vscale x 7 x i64> @vp_cttz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv7i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1967,40 +1967,40 @@ define <vscale x 7 x i64> @vp_cttz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vnot.v v16, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
@@ -2008,37 +2008,37 @@ define <vscale x 7 x i64> @vp_cttz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -2060,78 +2060,78 @@ define <vscale x 8 x i64> @vp_cttz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -2149,40 +2149,40 @@ define <vscale x 8 x i64> @vp_cttz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vnot.v v16, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
@@ -2190,37 +2190,37 @@ define <vscale x 8 x i64> @vp_cttz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -2253,37 +2253,44 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a2, a1, 3
-; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a3, a3, a2
 ; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    srli a3, a1, 3
+; RV32-NEXT:    sub a4, a0, a1
+; RV32-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a3
+; RV32-NEXT:    sltu a3, a0, a4
+; RV32-NEXT:    addi a3, a3, -1
+; RV32-NEXT:    and a3, a3, a4
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    addi a4, a4, 1365
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    li a6, 48
+; RV32-NEXT:    mul a5, a5, a6
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a4
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    li a5, 48
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    li a5, 24
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
@@ -2293,22 +2300,21 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
+; RV32-NEXT:    li a5, 24
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    li a5, 48
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    lui a4, 209715
 ; RV32-NEXT:    addi a4, a4, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
@@ -2388,11 +2394,11 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
@@ -2500,47 +2506,47 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a2, a1, 3
-; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a2
-; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    sltu a3, a0, a2
-; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a3, a3, a2
 ; RV64-NEXT:    li a2, 1
-; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v8, v16, a2, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV64-NEXT:    lui a3, 349525
-; RV64-NEXT:    addiw a3, a3, 1365
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v8, v0.t
 ; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    srli a7, a1, 3
+; RV64-NEXT:    sub t0, a0, a1
+; RV64-NEXT:    addiw a3, a3, 1365
 ; RV64-NEXT:    addiw a4, a4, 819
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v16, a4, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw t1, a6, 257
+; RV64-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a7
+; RV64-NEXT:    slli a7, a3, 32
+; RV64-NEXT:    add a7, a3, a7
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    add a6, a4, a6
+; RV64-NEXT:    slli a3, a5, 32
+; RV64-NEXT:    add a3, a5, a3
+; RV64-NEXT:    slli a4, t1, 32
+; RV64-NEXT:    add a4, t1, a4
+; RV64-NEXT:    sltu a5, a0, t0
+; RV64-NEXT:    addi a5, a5, -1
+; RV64-NEXT:    and t0, a5, t0
+; RV64-NEXT:    li a5, 56
+; RV64-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
+; RV64-NEXT:    vsub.vx v8, v16, a2, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a5, 61681
-; RV64-NEXT:    addiw a5, a5, -241
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addiw a6, a6, 257
-; RV64-NEXT:    slli a7, a6, 32
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    vmul.vx v8, v8, a6, v0.t
-; RV64-NEXT:    li a7, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    addi t0, sp, 16
 ; RV64-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
 ; RV64-NEXT:    bltu a0, a1, .LBB46_2
@@ -2557,17 +2563,17 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a6, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a6, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2583,12 +2589,12 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB46_2
@@ -2613,31 +2619,30 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a3, a3, a2
 ; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    lui a4, 209715
+; RV32-NEXT:    sub a5, a0, a1
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    addi a4, a4, 819
+; RV32-NEXT:    vsetvli a6, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a3
+; RV32-NEXT:    sltu a3, a0, a5
+; RV32-NEXT:    addi a3, a3, -1
+; RV32-NEXT:    and a3, a3, a5
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v24, v16, a2
 ; RV32-NEXT:    vnot.v v16, v16
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsrl.vi v24, v16, 1
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v0, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    li a6, 24
+; RV32-NEXT:    mul a5, a5, a6
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vs8r.v v0, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vand.vv v24, v24, v0
 ; RV32-NEXT:    vsub.vv v16, v16, v24
-; RV32-NEXT:    lui a4, 209715
-; RV32-NEXT:    addi a4, a4, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
@@ -2653,8 +2658,10 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    vsrl.vi v24, v16, 4
 ; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    lui a4, 61681
+; RV32-NEXT:    lui a5, 4112
 ; RV32-NEXT:    addi a4, a4, -241
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
+; RV32-NEXT:    addi a5, a5, 257
+; RV32-NEXT:    vsetvli a6, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a4
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 3
@@ -2663,10 +2670,8 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a5
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
@@ -2722,42 +2727,42 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV64-LABEL: vp_cttz_nxv16i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    sltu a3, a0, a2
-; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a3, a3, a2
 ; RV64-NEXT:    li a2, 1
-; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    sub a7, a0, a1
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw t0, a5, -241
+; RV64-NEXT:    addiw t1, a6, 257
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a6, a3, a6
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a5, a4, a5
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
+; RV64-NEXT:    slli a4, t1, 32
+; RV64-NEXT:    add a4, t1, a4
+; RV64-NEXT:    sltu t0, a0, a7
+; RV64-NEXT:    addi t0, t0, -1
+; RV64-NEXT:    and a7, t0, a7
+; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v24, v16, a2
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    lui a3, 349525
-; RV64-NEXT:    addiw a3, a3, 1365
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vand.vx v24, v24, a6
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    lui a4, 209715
-; RV64-NEXT:    addiw a4, a4, 819
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v24, v16, a4
+; RV64-NEXT:    vand.vx v24, v16, a5
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vand.vx v16, v16, a5
 ; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    lui a5, 61681
-; RV64-NEXT:    addiw a5, a5, -241
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addiw a6, a6, 257
-; RV64-NEXT:    slli a7, a6, 32
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    vmul.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vmul.vx v16, v16, a4
 ; RV64-NEXT:    li a7, 56
 ; RV64-NEXT:    vsrl.vx v16, v16, a7
 ; RV64-NEXT:    bltu a0, a1, .LBB47_2
@@ -2769,16 +2774,16 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vand.vx v24, v24, a6
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vand.vx v24, v8, a5
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a5
 ; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vand.vx v8, v8, a5
-; RV64-NEXT:    vmul.vx v8, v8, a6
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    vsrl.vx v8, v8, a7
 ; RV64-NEXT:    ret
 ;
@@ -2807,6 +2812,7 @@ define <vscale x 1 x i8> @vp_cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
@@ -2817,7 +2823,6 @@ define <vscale x 1 x i8> @vp_cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2861,6 +2866,7 @@ define <vscale x 2 x i8> @vp_cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
@@ -2871,7 +2877,6 @@ define <vscale x 2 x i8> @vp_cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2915,6 +2920,7 @@ define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
@@ -2925,7 +2931,6 @@ define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2969,6 +2974,7 @@ define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
@@ -2979,7 +2985,6 @@ define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3023,6 +3028,7 @@ define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
@@ -3033,7 +3039,6 @@ define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3078,13 +3083,13 @@ define <vscale x 32 x i8> @vp_cttz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <v
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -3108,14 +3113,14 @@ define <vscale x 32 x i8> @vp_cttz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -3141,13 +3146,13 @@ define <vscale x 64 x i8> @vp_cttz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <v
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -3171,14 +3176,14 @@ define <vscale x 64 x i8> @vp_cttz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -3203,13 +3208,13 @@ define <vscale x 1 x i16> @vp_cttz_zero_undef_nxv1i16(<vscale x 1 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3249,13 +3254,13 @@ define <vscale x 2 x i16> @vp_cttz_zero_undef_nxv2i16(<vscale x 2 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3295,13 +3300,13 @@ define <vscale x 4 x i16> @vp_cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3341,13 +3346,13 @@ define <vscale x 8 x i16> @vp_cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3387,13 +3392,13 @@ define <vscale x 16 x i16> @vp_cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3434,23 +3439,23 @@ define <vscale x 32 x i16> @vp_cttz_zero_undef_nxv32i16(<vscale x 32 x i16> %va,
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -3471,24 +3476,24 @@ define <vscale x 32 x i16> @vp_cttz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -3510,14 +3515,14 @@ define <vscale x 1 x i32> @vp_cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3535,9 +3540,9 @@ define <vscale x 1 x i32> @vp_cttz_zero_undef_nxv1i32_unmasked(<vscale x 1 x i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v9, a0
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
@@ -3558,14 +3563,14 @@ define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3583,9 +3588,9 @@ define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32_unmasked(<vscale x 2 x i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v10, a0
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
@@ -3606,14 +3611,14 @@ define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3631,9 +3636,9 @@ define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32_unmasked(<vscale x 4 x i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v12, a0
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
@@ -3654,14 +3659,14 @@ define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3679,9 +3684,9 @@ define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v12
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v16, a0
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
@@ -3702,11 +3707,11 @@ define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 127
+; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    li a1, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -3725,8 +3730,8 @@ define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32_unmasked(<vscale x 16 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-NEXT:    li a1, 127
@@ -3749,10 +3754,10 @@ define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3773,10 +3778,10 @@ define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -3798,10 +3803,10 @@ define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3822,10 +3827,10 @@ define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -3847,10 +3852,10 @@ define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3871,10 +3876,10 @@ define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v12
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v12
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -3896,10 +3901,10 @@ define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3920,10 +3925,10 @@ define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -3945,10 +3950,10 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3969,10 +3974,10 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -4004,20 +4009,20 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a4, a0, a1
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    sltu a2, a0, a4
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a4, a2, a4
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v8, v16, 0, v0.t
 ; CHECK-NEXT:    vand.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    li a3, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
@@ -4034,8 +4039,8 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
@@ -4055,12 +4060,12 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB94_2
@@ -4079,17 +4084,17 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    sltu a4, a0, a2
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a4, a4, a2
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v16, 0
 ; CHECK-NEXT:    vand.vv v16, v16, v24
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2
 ; CHECK-NEXT:    li a3, 1023
 ; CHECK-NEXT:    vsub.vx v16, v16, a3
@@ -4161,13 +4166,13 @@ define <vscale x 1 x i9> @vp_zero_undef_cttz_nxv1i9(<vscale x 1 x i9> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
index 9e466820d83fe5..31fa5d025156f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
@@ -105,17 +105,17 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
 ; NO-SINK-NEXT:  .LBB1_3: # %vector.body
 ; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NO-SINK-NEXT:    vl2re32.v v10, (a6)
+; NO-SINK-NEXT:    sub a7, a7, a3
 ; NO-SINK-NEXT:    vadd.vv v10, v10, v8
 ; NO-SINK-NEXT:    vs2r.v v10, (a6)
-; NO-SINK-NEXT:    sub a7, a7, a3
 ; NO-SINK-NEXT:    add a6, a6, a5
 ; NO-SINK-NEXT:    bnez a7, .LBB1_3
 ; NO-SINK-NEXT:  # %bb.4: # %middle.block
 ; NO-SINK-NEXT:    beqz a4, .LBB1_7
 ; NO-SINK-NEXT:  .LBB1_5: # %for.body.preheader
 ; NO-SINK-NEXT:    slli a2, a2, 2
-; NO-SINK-NEXT:    add a2, a0, a2
 ; NO-SINK-NEXT:    lui a3, 1
+; NO-SINK-NEXT:    add a2, a0, a2
 ; NO-SINK-NEXT:    add a0, a0, a3
 ; NO-SINK-NEXT:  .LBB1_6: # %for.body
 ; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -147,17 +147,17 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
 ; SINK-NEXT:  .LBB1_3: # %vector.body
 ; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SINK-NEXT:    vl2re32.v v8, (a6)
+; SINK-NEXT:    sub a7, a7, a3
 ; SINK-NEXT:    vadd.vx v8, v8, a1
 ; SINK-NEXT:    vs2r.v v8, (a6)
-; SINK-NEXT:    sub a7, a7, a3
 ; SINK-NEXT:    add a6, a6, a5
 ; SINK-NEXT:    bnez a7, .LBB1_3
 ; SINK-NEXT:  # %bb.4: # %middle.block
 ; SINK-NEXT:    beqz a4, .LBB1_7
 ; SINK-NEXT:  .LBB1_5: # %for.body.preheader
 ; SINK-NEXT:    slli a2, a2, 2
-; SINK-NEXT:    add a2, a0, a2
 ; SINK-NEXT:    lui a3, 1
+; SINK-NEXT:    add a2, a0, a2
 ; SINK-NEXT:    add a0, a0, a3
 ; SINK-NEXT:  .LBB1_6: # %for.body
 ; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -189,17 +189,17 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
 ; DEFAULT-NEXT:  .LBB1_3: # %vector.body
 ; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
 ; DEFAULT-NEXT:    vl2re32.v v8, (a6)
+; DEFAULT-NEXT:    sub a7, a7, a3
 ; DEFAULT-NEXT:    vadd.vx v8, v8, a1
 ; DEFAULT-NEXT:    vs2r.v v8, (a6)
-; DEFAULT-NEXT:    sub a7, a7, a3
 ; DEFAULT-NEXT:    add a6, a6, a5
 ; DEFAULT-NEXT:    bnez a7, .LBB1_3
 ; DEFAULT-NEXT:  # %bb.4: # %middle.block
 ; DEFAULT-NEXT:    beqz a4, .LBB1_7
 ; DEFAULT-NEXT:  .LBB1_5: # %for.body.preheader
 ; DEFAULT-NEXT:    slli a2, a2, 2
-; DEFAULT-NEXT:    add a2, a0, a2
 ; DEFAULT-NEXT:    lui a3, 1
+; DEFAULT-NEXT:    add a2, a0, a2
 ; DEFAULT-NEXT:    add a0, a0, a3
 ; DEFAULT-NEXT:  .LBB1_6: # %for.body
 ; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -424,17 +424,17 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; NO-SINK-NEXT:  .LBB4_3: # %vector.body
 ; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NO-SINK-NEXT:    vl1re32.v v9, (a5)
+; NO-SINK-NEXT:    sub a6, a6, a3
 ; NO-SINK-NEXT:    vfadd.vv v9, v9, v8
 ; NO-SINK-NEXT:    vs1r.v v9, (a5)
-; NO-SINK-NEXT:    sub a6, a6, a3
 ; NO-SINK-NEXT:    add a5, a5, a1
 ; NO-SINK-NEXT:    bnez a6, .LBB4_3
 ; NO-SINK-NEXT:  # %bb.4: # %middle.block
 ; NO-SINK-NEXT:    beqz a4, .LBB4_7
 ; NO-SINK-NEXT:  .LBB4_5: # %for.body.preheader
 ; NO-SINK-NEXT:    slli a1, a2, 2
-; NO-SINK-NEXT:    add a1, a0, a1
 ; NO-SINK-NEXT:    lui a2, 1
+; NO-SINK-NEXT:    add a1, a0, a1
 ; NO-SINK-NEXT:    add a0, a0, a2
 ; NO-SINK-NEXT:  .LBB4_6: # %for.body
 ; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -465,17 +465,17 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; SINK-NEXT:  .LBB4_3: # %vector.body
 ; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SINK-NEXT:    vl1re32.v v8, (a5)
+; SINK-NEXT:    sub a6, a6, a3
 ; SINK-NEXT:    vfadd.vf v8, v8, fa0
 ; SINK-NEXT:    vs1r.v v8, (a5)
-; SINK-NEXT:    sub a6, a6, a3
 ; SINK-NEXT:    add a5, a5, a1
 ; SINK-NEXT:    bnez a6, .LBB4_3
 ; SINK-NEXT:  # %bb.4: # %middle.block
 ; SINK-NEXT:    beqz a4, .LBB4_7
 ; SINK-NEXT:  .LBB4_5: # %for.body.preheader
 ; SINK-NEXT:    slli a1, a2, 2
-; SINK-NEXT:    add a1, a0, a1
 ; SINK-NEXT:    lui a2, 1
+; SINK-NEXT:    add a1, a0, a1
 ; SINK-NEXT:    add a0, a0, a2
 ; SINK-NEXT:  .LBB4_6: # %for.body
 ; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -506,17 +506,17 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; DEFAULT-NEXT:  .LBB4_3: # %vector.body
 ; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
 ; DEFAULT-NEXT:    vl1re32.v v8, (a5)
+; DEFAULT-NEXT:    sub a6, a6, a3
 ; DEFAULT-NEXT:    vfadd.vf v8, v8, fa0
 ; DEFAULT-NEXT:    vs1r.v v8, (a5)
-; DEFAULT-NEXT:    sub a6, a6, a3
 ; DEFAULT-NEXT:    add a5, a5, a1
 ; DEFAULT-NEXT:    bnez a6, .LBB4_3
 ; DEFAULT-NEXT:  # %bb.4: # %middle.block
 ; DEFAULT-NEXT:    beqz a4, .LBB4_7
 ; DEFAULT-NEXT:  .LBB4_5: # %for.body.preheader
 ; DEFAULT-NEXT:    slli a1, a2, 2
-; DEFAULT-NEXT:    add a1, a0, a1
 ; DEFAULT-NEXT:    lui a2, 1
+; DEFAULT-NEXT:    add a1, a0, a1
 ; DEFAULT-NEXT:    add a0, a0, a2
 ; DEFAULT-NEXT:  .LBB4_6: # %for.body
 ; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
index 92b88054a1d3bc..5b82b27a51510b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
@@ -18,24 +18,24 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-NEXT:    lw a2, 12(a2)
 ; RV32-NEXT:    snez t2, a3
 ; RV32-NEXT:    sltiu t3, a3, 3
-; RV32-NEXT:    xori t3, t3, 1
 ; RV32-NEXT:    sltiu t4, a3, 4
-; RV32-NEXT:    xori t4, t4, 1
 ; RV32-NEXT:    sltiu a3, a3, 2
+; RV32-NEXT:    xori t3, t3, 1
+; RV32-NEXT:    xori t4, t4, 1
 ; RV32-NEXT:    xori a3, a3, 1
 ; RV32-NEXT:    and a3, a3, t0
 ; RV32-NEXT:    and a2, t4, a2
 ; RV32-NEXT:    and t0, t3, t1
 ; RV32-NEXT:    and a7, t2, a7
 ; RV32-NEXT:    neg a7, a7
-; RV32-NEXT:    and a4, a7, a4
-; RV32-NEXT:    neg a7, t0
-; RV32-NEXT:    and a6, a7, a6
+; RV32-NEXT:    neg t0, t0
 ; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a4, a7, a4
+; RV32-NEXT:    and a6, t0, a6
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    neg a2, a3
-; RV32-NEXT:    and a2, a2, a5
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    and a3, a3, a5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a4, a4, a6
 ; RV32-NEXT:    add a1, a4, a1
 ; RV32-NEXT:    add a0, a1, a0
@@ -54,24 +54,24 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV64-NEXT:    sext.w a3, a3
 ; RV64-NEXT:    snez t2, a3
 ; RV64-NEXT:    sltiu t3, a3, 3
-; RV64-NEXT:    xori t3, t3, 1
 ; RV64-NEXT:    sltiu t4, a3, 4
-; RV64-NEXT:    xori t4, t4, 1
 ; RV64-NEXT:    sltiu a3, a3, 2
+; RV64-NEXT:    xori t3, t3, 1
+; RV64-NEXT:    xori t4, t4, 1
 ; RV64-NEXT:    xori a3, a3, 1
 ; RV64-NEXT:    and a3, a3, t0
 ; RV64-NEXT:    and a2, t4, a2
 ; RV64-NEXT:    and t0, t3, t1
 ; RV64-NEXT:    and a7, t2, a7
 ; RV64-NEXT:    negw a7, a7
-; RV64-NEXT:    and a4, a7, a4
-; RV64-NEXT:    negw a7, t0
-; RV64-NEXT:    and a6, a7, a6
+; RV64-NEXT:    negw t0, t0
 ; RV64-NEXT:    negw a2, a2
+; RV64-NEXT:    negw a3, a3
+; RV64-NEXT:    and a4, a7, a4
+; RV64-NEXT:    and a6, t0, a6
 ; RV64-NEXT:    and a1, a2, a1
-; RV64-NEXT:    negw a2, a3
-; RV64-NEXT:    and a2, a2, a5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    and a3, a3, a5
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    add a1, a4, a1
 ; RV64-NEXT:    addw a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
index 63da328f811d88..f1fcaed2762ae7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
@@ -229,38 +229,40 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vmv1r.v v7, v8
 ; CHECK-RV32-NEXT:    li a2, 128
+; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vslidedown.vi v9, v0, 1
+; CHECK-RV32-NEXT:    li a3, 32
+; CHECK-RV32-NEXT:    vmv.x.s a4, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV32-NEXT:    vle8.v v8, (a1)
+; CHECK-RV32-NEXT:    vle8.v v16, (a1)
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
 ; CHECK-RV32-NEXT:    slli a1, a1, 3
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v9, v0, 1
-; CHECK-RV32-NEXT:    li a1, 32
-; CHECK-RV32-NEXT:    vsrl.vx v10, v9, a1
-; CHECK-RV32-NEXT:    vmv.x.s a3, v10
-; CHECK-RV32-NEXT:    vsrl.vx v10, v0, a1
-; CHECK-RV32-NEXT:    vmv.x.s a1, v10
-; CHECK-RV32-NEXT:    vmv.x.s a4, v9
-; CHECK-RV32-NEXT:    vmv.x.s a5, v0
+; CHECK-RV32-NEXT:    vsrl.vx v10, v9, a3
+; CHECK-RV32-NEXT:    vsrl.vx v11, v0, a3
+; CHECK-RV32-NEXT:    vmv.x.s a1, v9
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a6, v0
-; CHECK-RV32-NEXT:    vsetvli zero, a6, e8, m8, ta, ma
+; CHECK-RV32-NEXT:    vcpop.m a3, v0
+; CHECK-RV32-NEXT:    cpop a4, a4
+; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.x.s a5, v10
+; CHECK-RV32-NEXT:    vmv.x.s a6, v11
+; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle8.v v8, (a0)
-; CHECK-RV32-NEXT:    csrr a6, vlenb
-; CHECK-RV32-NEXT:    slli a6, a6, 4
-; CHECK-RV32-NEXT:    add a6, sp, a6
-; CHECK-RV32-NEXT:    addi a6, a6, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    csrr a3, vlenb
+; CHECK-RV32-NEXT:    slli a3, a3, 4
+; CHECK-RV32-NEXT:    add a3, sp, a3
+; CHECK-RV32-NEXT:    addi a3, a3, 16
+; CHECK-RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    cpop a1, a1
+; CHECK-RV32-NEXT:    cpop a3, a6
 ; CHECK-RV32-NEXT:    cpop a5, a5
-; CHECK-RV32-NEXT:    add a1, a5, a1
-; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    cpop a4, a4
 ; CHECK-RV32-NEXT:    add a3, a4, a3
-; CHECK-RV32-NEXT:    add a1, a1, a3
+; CHECK-RV32-NEXT:    add a1, a1, a5
+; CHECK-RV32-NEXT:    add a1, a3, a1
 ; CHECK-RV32-NEXT:    add a0, a0, a1
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a1, v7
@@ -338,17 +340,18 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vmv1r.v v7, v8
 ; CHECK-RV64-NEXT:    li a2, 128
+; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-RV64-NEXT:    vslidedown.vi v9, v0, 1
+; CHECK-RV64-NEXT:    vmv.x.s a3, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV64-NEXT:    vle8.v v8, (a1)
+; CHECK-RV64-NEXT:    vle8.v v16, (a1)
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
 ; CHECK-RV64-NEXT:    slli a1, a1, 3
 ; CHECK-RV64-NEXT:    add a1, sp, a1
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
-; CHECK-RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v9, v0, 1
+; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vmv.x.s a1, v9
-; CHECK-RV64-NEXT:    vmv.x.s a3, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV64-NEXT:    vcpop.m a4, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a4, e8, m8, ta, ma
@@ -429,23 +432,23 @@ define <256 x i8> @test_expandload_v256i8_all_ones(ptr %base, <256 x i8> %passth
 ; CHECK-RV32-LABEL: test_expandload_v256i8_all_ones:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    li a1, 128
+; CHECK-RV32-NEXT:    li a2, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-RV32-NEXT:    vmset.m v8
-; CHECK-RV32-NEXT:    li a2, 32
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v9, v8, a2
-; CHECK-RV32-NEXT:    vmv.x.s a3, v9
-; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    vmv.x.s a4, v8
-; CHECK-RV32-NEXT:    cpop a4, a4
-; CHECK-RV32-NEXT:    add a3, a4, a3
+; CHECK-RV32-NEXT:    vmv.x.s a3, v8
 ; CHECK-RV32-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-RV32-NEXT:    vmv.x.s a4, v9
+; CHECK-RV32-NEXT:    cpop a3, a3
 ; CHECK-RV32-NEXT:    vsrl.vx v9, v8, a2
-; CHECK-RV32-NEXT:    vmv.x.s a2, v9
+; CHECK-RV32-NEXT:    vmv.x.s a2, v8
+; CHECK-RV32-NEXT:    cpop a4, a4
+; CHECK-RV32-NEXT:    add a3, a3, a4
+; CHECK-RV32-NEXT:    vmv.x.s a4, v9
 ; CHECK-RV32-NEXT:    cpop a2, a2
-; CHECK-RV32-NEXT:    vmv.x.s a4, v8
 ; CHECK-RV32-NEXT:    cpop a4, a4
-; CHECK-RV32-NEXT:    add a2, a4, a2
+; CHECK-RV32-NEXT:    add a2, a2, a4
 ; CHECK-RV32-NEXT:    add a3, a0, a3
 ; CHECK-RV32-NEXT:    add a2, a3, a2
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
@@ -461,8 +464,8 @@ define <256 x i8> @test_expandload_v256i8_all_ones(ptr %base, <256 x i8> %passth
 ; CHECK-RV64-NEXT:    vmset.m v16
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vmv.x.s a2, v16
-; CHECK-RV64-NEXT:    cpop a2, a2
 ; CHECK-RV64-NEXT:    vslidedown.vi v16, v16, 1
+; CHECK-RV64-NEXT:    cpop a2, a2
 ; CHECK-RV64-NEXT:    vmv.x.s a3, v16
 ; CHECK-RV64-NEXT:    cpop a3, a3
 ; CHECK-RV64-NEXT:    add a0, a0, a2
@@ -665,33 +668,35 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV32-NEXT:    sub sp, sp, a1
 ; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 4
+; CHECK-RV32-NEXT:    li a2, 24
+; CHECK-RV32-NEXT:    mul a1, a1, a2
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    li a1, 64
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a2, v0
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-RV32-NEXT:    vle16.v v24, (a0)
-; CHECK-RV32-NEXT:    csrr a2, vlenb
-; CHECK-RV32-NEXT:    li a3, 24
-; CHECK-RV32-NEXT:    mul a2, a2, a3
-; CHECK-RV32-NEXT:    add a2, sp, a2
-; CHECK-RV32-NEXT:    addi a2, a2, 16
-; CHECK-RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 8
+; CHECK-RV32-NEXT:    li a2, 32
+; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vmv.x.s a3, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a2, v7
-; CHECK-RV32-NEXT:    li a3, 32
+; CHECK-RV32-NEXT:    vcpop.m a4, v0
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v25, v0, a3
-; CHECK-RV32-NEXT:    vmv.x.s a3, v25
-; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    vmv.x.s a4, v0
+; CHECK-RV32-NEXT:    vsrl.vx v25, v0, a2
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-RV32-NEXT:    vcpop.m a2, v7
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
+; CHECK-RV32-NEXT:    vle16.v v16, (a0)
+; CHECK-RV32-NEXT:    csrr a5, vlenb
+; CHECK-RV32-NEXT:    slli a5, a5, 4
+; CHECK-RV32-NEXT:    add a5, sp, a5
+; CHECK-RV32-NEXT:    addi a5, a5, 16
+; CHECK-RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.x.s a4, v25
 ; CHECK-RV32-NEXT:    cpop a4, a4
-; CHECK-RV32-NEXT:    add a3, a4, a3
+; CHECK-RV32-NEXT:    cpop a3, a3
+; CHECK-RV32-NEXT:    add a3, a3, a4
 ; CHECK-RV32-NEXT:    slli a3, a3, 1
 ; CHECK-RV32-NEXT:    add a0, a0, a3
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
@@ -702,20 +707,20 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v24, v0
+; CHECK-RV32-NEXT:    viota.m v16, v0
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
+; CHECK-RV32-NEXT:    slli a0, a0, 4
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    li a1, 24
+; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -749,26 +754,26 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    li a1, 64
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a2, v0
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-RV64-NEXT:    vle16.v v24, (a0)
-; CHECK-RV64-NEXT:    csrr a2, vlenb
-; CHECK-RV64-NEXT:    li a3, 24
-; CHECK-RV64-NEXT:    mul a2, a2, a3
-; CHECK-RV64-NEXT:    add a2, sp, a2
-; CHECK-RV64-NEXT:    addi a2, a2, 16
-; CHECK-RV64-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 8
+; CHECK-RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-RV64-NEXT:    vmv.x.s a2, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a2, v7
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a3, v0
-; CHECK-RV64-NEXT:    cpop a3, a3
-; CHECK-RV64-NEXT:    slli a3, a3, 1
-; CHECK-RV64-NEXT:    add a0, a0, a3
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-RV64-NEXT:    vcpop.m a3, v0
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
+; CHECK-RV64-NEXT:    vle16.v v24, (a0)
+; CHECK-RV64-NEXT:    csrr a3, vlenb
+; CHECK-RV64-NEXT:    li a4, 24
+; CHECK-RV64-NEXT:    mul a3, a3, a4
+; CHECK-RV64-NEXT:    add a3, sp, a3
+; CHECK-RV64-NEXT:    addi a3, a3, 16
+; CHECK-RV64-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-RV64-NEXT:    vcpop.m a3, v7
+; CHECK-RV64-NEXT:    cpop a2, a2
+; CHECK-RV64-NEXT:    slli a2, a2, 1
+; CHECK-RV64-NEXT:    add a0, a0, a2
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle16.v v16, (a0)
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 3
@@ -786,13 +791,7 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v16, v7
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v8, v7
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 4
@@ -804,12 +803,6 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -828,17 +821,17 @@ define <128 x i16> @test_expandload_v128i16_all_ones(ptr %base, <128 x i16> %pas
 ; CHECK-RV32-LABEL: test_expandload_v128i16_all_ones:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    li a1, 64
+; CHECK-RV32-NEXT:    li a2, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle16.v v8, (a0)
 ; CHECK-RV32-NEXT:    vmset.m v16
-; CHECK-RV32-NEXT:    li a2, 32
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v17, v16, a2
-; CHECK-RV32-NEXT:    vmv.x.s a2, v17
-; CHECK-RV32-NEXT:    cpop a2, a2
-; CHECK-RV32-NEXT:    vmv.x.s a3, v16
+; CHECK-RV32-NEXT:    vmv.x.s a2, v16
+; CHECK-RV32-NEXT:    vmv.x.s a3, v17
 ; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    add a2, a3, a2
+; CHECK-RV32-NEXT:    cpop a2, a2
+; CHECK-RV32-NEXT:    add a2, a2, a3
 ; CHECK-RV32-NEXT:    slli a2, a2, 1
 ; CHECK-RV32-NEXT:    add a0, a0, a2
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -1032,25 +1025,26 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    li a1, 32
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a2, v0
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-RV32-NEXT:    vle32.v v24, (a0)
-; CHECK-RV32-NEXT:    csrr a2, vlenb
-; CHECK-RV32-NEXT:    li a3, 24
-; CHECK-RV32-NEXT:    mul a2, a2, a3
-; CHECK-RV32-NEXT:    add a2, sp, a2
-; CHECK-RV32-NEXT:    addi a2, a2, 16
-; CHECK-RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 4
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a2, v7
-; CHECK-RV32-NEXT:    vmv.x.s a3, v0
-; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    slli a3, a3, 2
-; CHECK-RV32-NEXT:    add a0, a0, a3
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-RV32-NEXT:    vmv.x.s a2, v0
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32-NEXT:    vcpop.m a3, v0
+; CHECK-RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-RV32-NEXT:    vle32.v v24, (a0)
+; CHECK-RV32-NEXT:    csrr a3, vlenb
+; CHECK-RV32-NEXT:    li a4, 24
+; CHECK-RV32-NEXT:    mul a3, a3, a4
+; CHECK-RV32-NEXT:    add a3, sp, a3
+; CHECK-RV32-NEXT:    addi a3, a3, 16
+; CHECK-RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32-NEXT:    vcpop.m a3, v7
+; CHECK-RV32-NEXT:    cpop a2, a2
+; CHECK-RV32-NEXT:    slli a2, a2, 2
+; CHECK-RV32-NEXT:    add a0, a0, a2
+; CHECK-RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle32.v v16, (a0)
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 3
@@ -1068,13 +1062,7 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v16, v7
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 4
@@ -1086,12 +1074,6 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -1117,25 +1099,26 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    li a1, 32
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a2, v0
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-RV64-NEXT:    vle32.v v24, (a0)
-; CHECK-RV64-NEXT:    csrr a2, vlenb
-; CHECK-RV64-NEXT:    li a3, 24
-; CHECK-RV64-NEXT:    mul a2, a2, a3
-; CHECK-RV64-NEXT:    add a2, sp, a2
-; CHECK-RV64-NEXT:    addi a2, a2, 16
-; CHECK-RV64-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 4
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a2, v7
-; CHECK-RV64-NEXT:    vmv.x.s a3, v0
-; CHECK-RV64-NEXT:    cpopw a3, a3
-; CHECK-RV64-NEXT:    slli a3, a3, 2
-; CHECK-RV64-NEXT:    add a0, a0, a3
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-RV64-NEXT:    vmv.x.s a2, v0
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV64-NEXT:    vcpop.m a3, v0
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-RV64-NEXT:    vle32.v v24, (a0)
+; CHECK-RV64-NEXT:    csrr a3, vlenb
+; CHECK-RV64-NEXT:    li a4, 24
+; CHECK-RV64-NEXT:    mul a3, a3, a4
+; CHECK-RV64-NEXT:    add a3, sp, a3
+; CHECK-RV64-NEXT:    addi a3, a3, 16
+; CHECK-RV64-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV64-NEXT:    vcpop.m a3, v7
+; CHECK-RV64-NEXT:    cpopw a2, a2
+; CHECK-RV64-NEXT:    slli a2, a2, 2
+; CHECK-RV64-NEXT:    add a0, a0, a2
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle32.v v16, (a0)
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 3
@@ -1153,13 +1136,7 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v16, v7
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v8, v7
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 4
@@ -1171,12 +1148,6 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -1369,12 +1340,12 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vmv.x.s a1, v0
+; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-RV32-NEXT:    zext.h a1, a1
 ; CHECK-RV32-NEXT:    cpop a1, a1
 ; CHECK-RV32-NEXT:    slli a1, a1, 3
 ; CHECK-RV32-NEXT:    add a0, a0, a1
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a1, v7
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1395,13 +1366,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v16, v7
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 4
@@ -1413,12 +1378,6 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -1454,12 +1413,12 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vmv.x.s a1, v0
+; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-RV64-NEXT:    zext.h a1, a1
 ; CHECK-RV64-NEXT:    cpopw a1, a1
 ; CHECK-RV64-NEXT:    slli a1, a1, 3
 ; CHECK-RV64-NEXT:    add a0, a0, a1
-; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-RV64-NEXT:    vcpop.m a1, v7
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1480,13 +1439,7 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v16, v7
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v8, v7
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 4
@@ -1498,12 +1451,6 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -1671,26 +1618,26 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a1, .LBB61_30
 ; CHECK-RV32-NEXT:  .LBB61_29: # %cond.load109
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 29, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 28
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 28
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_30: # %else110
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    li a1, 32
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_32
 ; CHECK-RV32-NEXT:  # %bb.31: # %cond.load113
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 30, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a2
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 29
+; CHECK-RV32-NEXT:    vmv.s.x v9, a2
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 29
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_32: # %else114
 ; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -1698,10 +1645,10 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_34
 ; CHECK-RV32-NEXT:  # %bb.33: # %cond.load117
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v17, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a2
 ; CHECK-RV32-NEXT:    vsetivli zero, 31, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vi v8, v17, 30
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 30
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -1832,13 +1779,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_65: # %cond.load241
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 62
 ; CHECK-RV32-NEXT:    li a4, 61
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -1849,12 +1796,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_68
 ; CHECK-RV32-NEXT:  # %bb.67: # %cond.load245
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v17, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 63
 ; CHECK-RV32-NEXT:    li a4, 62
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v17, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -1985,13 +1932,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_99: # %cond.load369
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 94
 ; CHECK-RV32-NEXT:    li a4, 93
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2002,12 +1949,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_102
 ; CHECK-RV32-NEXT:  # %bb.101: # %cond.load373
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 95
 ; CHECK-RV32-NEXT:    li a4, 94
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2138,13 +2085,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_133: # %cond.load497
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 126
 ; CHECK-RV32-NEXT:    li a4, 125
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -2155,12 +2102,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_136
 ; CHECK-RV32-NEXT:  # %bb.135: # %cond.load501
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v18, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 127
 ; CHECK-RV32-NEXT:    li a4, 126
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2291,13 +2238,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_167: # %cond.load625
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 158
 ; CHECK-RV32-NEXT:    li a4, 157
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2308,12 +2255,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_170
 ; CHECK-RV32-NEXT:  # %bb.169: # %cond.load629
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 159
 ; CHECK-RV32-NEXT:    li a4, 158
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2444,16 +2391,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_201: # %cond.load753
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 190
 ; CHECK-RV32-NEXT:    li a4, 189
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_202: # %else754
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2461,12 +2408,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_204
 ; CHECK-RV32-NEXT:  # %bb.203: # %cond.load757
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 191
 ; CHECK-RV32-NEXT:    li a4, 190
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2597,13 +2544,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_235: # %cond.load881
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 222
 ; CHECK-RV32-NEXT:    li a4, 221
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2614,12 +2561,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_238
 ; CHECK-RV32-NEXT:  # %bb.237: # %cond.load885
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 223
 ; CHECK-RV32-NEXT:    li a4, 222
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2750,16 +2697,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_269: # %cond.load1009
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 254
 ; CHECK-RV32-NEXT:    li a4, 253
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_270: # %else1010
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2767,12 +2714,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_272
 ; CHECK-RV32-NEXT:  # %bb.271: # %cond.load1013
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 255
 ; CHECK-RV32-NEXT:    li a4, 254
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -3952,326 +3899,326 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_2
 ; CHECK-RV32-NEXT:  .LBB61_545: # %cond.load1
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 4
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_546
 ; CHECK-RV32-NEXT:    j .LBB61_3
 ; CHECK-RV32-NEXT:  .LBB61_546: # %cond.load5
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 2
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 8
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_547
 ; CHECK-RV32-NEXT:    j .LBB61_4
 ; CHECK-RV32-NEXT:  .LBB61_547: # %cond.load9
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 3
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 3
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 16
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_548
 ; CHECK-RV32-NEXT:    j .LBB61_5
 ; CHECK-RV32-NEXT:  .LBB61_548: # %cond.load13
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 5, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 4
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 32
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_549
 ; CHECK-RV32-NEXT:    j .LBB61_6
 ; CHECK-RV32-NEXT:  .LBB61_549: # %cond.load17
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 5
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 5
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 64
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_550
 ; CHECK-RV32-NEXT:    j .LBB61_7
 ; CHECK-RV32-NEXT:  .LBB61_550: # %cond.load21
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 7, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 6
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 6
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 128
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_551
 ; CHECK-RV32-NEXT:    j .LBB61_8
 ; CHECK-RV32-NEXT:  .LBB61_551: # %cond.load25
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 7
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 7
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 256
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_552
 ; CHECK-RV32-NEXT:    j .LBB61_9
 ; CHECK-RV32-NEXT:  .LBB61_552: # %cond.load29
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 9, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 8
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 512
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_553
 ; CHECK-RV32-NEXT:    j .LBB61_10
 ; CHECK-RV32-NEXT:  .LBB61_553: # %cond.load33
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 9
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 9
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_554
 ; CHECK-RV32-NEXT:    j .LBB61_11
 ; CHECK-RV32-NEXT:  .LBB61_554: # %cond.load37
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 11, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 10
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 10
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 20
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_555
 ; CHECK-RV32-NEXT:    j .LBB61_12
 ; CHECK-RV32-NEXT:  .LBB61_555: # %cond.load41
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 11
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 11
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 19
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_556
 ; CHECK-RV32-NEXT:    j .LBB61_13
 ; CHECK-RV32-NEXT:  .LBB61_556: # %cond.load45
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 13, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 12
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 12
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 18
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_557
 ; CHECK-RV32-NEXT:    j .LBB61_14
 ; CHECK-RV32-NEXT:  .LBB61_557: # %cond.load49
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 13
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 13
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 17
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_558
 ; CHECK-RV32-NEXT:    j .LBB61_15
 ; CHECK-RV32-NEXT:  .LBB61_558: # %cond.load53
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 15, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 14
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 14
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 16
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_559
 ; CHECK-RV32-NEXT:    j .LBB61_16
 ; CHECK-RV32-NEXT:  .LBB61_559: # %cond.load57
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 15
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 15
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 15
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_560
 ; CHECK-RV32-NEXT:    j .LBB61_17
 ; CHECK-RV32-NEXT:  .LBB61_560: # %cond.load61
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 17, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 16
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 16
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 14
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_561
 ; CHECK-RV32-NEXT:    j .LBB61_18
 ; CHECK-RV32-NEXT:  .LBB61_561: # %cond.load65
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 18, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 17
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 17
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 13
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_562
 ; CHECK-RV32-NEXT:    j .LBB61_19
 ; CHECK-RV32-NEXT:  .LBB61_562: # %cond.load69
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 19, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 18
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 18
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 12
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_563
 ; CHECK-RV32-NEXT:    j .LBB61_20
 ; CHECK-RV32-NEXT:  .LBB61_563: # %cond.load73
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 20, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 19
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 19
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 11
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_564
 ; CHECK-RV32-NEXT:    j .LBB61_21
 ; CHECK-RV32-NEXT:  .LBB61_564: # %cond.load77
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 21, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 20
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 20
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 10
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_565
 ; CHECK-RV32-NEXT:    j .LBB61_22
 ; CHECK-RV32-NEXT:  .LBB61_565: # %cond.load81
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 22, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 21
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 21
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 9
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_566
 ; CHECK-RV32-NEXT:    j .LBB61_23
 ; CHECK-RV32-NEXT:  .LBB61_566: # %cond.load85
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 23, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 22
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 22
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 8
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_567
 ; CHECK-RV32-NEXT:    j .LBB61_24
 ; CHECK-RV32-NEXT:  .LBB61_567: # %cond.load89
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 24, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 23
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 23
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 7
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_568
 ; CHECK-RV32-NEXT:    j .LBB61_25
 ; CHECK-RV32-NEXT:  .LBB61_568: # %cond.load93
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 25, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 24
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 24
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 6
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_569
 ; CHECK-RV32-NEXT:    j .LBB61_26
 ; CHECK-RV32-NEXT:  .LBB61_569: # %cond.load97
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 26, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 25
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 25
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 5
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_570
 ; CHECK-RV32-NEXT:    j .LBB61_27
 ; CHECK-RV32-NEXT:  .LBB61_570: # %cond.load101
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 27, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 26
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 26
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 4
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_571
 ; CHECK-RV32-NEXT:    j .LBB61_28
 ; CHECK-RV32-NEXT:  .LBB61_571: # %cond.load105
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 28, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 27
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 27
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 3
 ; CHECK-RV32-NEXT:    bgez a1, .LBB61_1025
 ; CHECK-RV32-NEXT:    j .LBB61_29
@@ -4279,11 +4226,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_30
 ; CHECK-RV32-NEXT:  .LBB61_572: # %cond.load121
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vi v8, v24, 31
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 31
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4293,13 +4240,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_573: # %cond.load125
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 33
 ; CHECK-RV32-NEXT:    li a4, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4309,13 +4256,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_574: # %cond.load129
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 34
 ; CHECK-RV32-NEXT:    li a4, 33
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4325,13 +4272,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_575: # %cond.load133
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 35
 ; CHECK-RV32-NEXT:    li a4, 34
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4341,13 +4288,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_576: # %cond.load137
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 36
 ; CHECK-RV32-NEXT:    li a4, 35
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4357,13 +4304,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_577: # %cond.load141
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 37
 ; CHECK-RV32-NEXT:    li a4, 36
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4373,13 +4320,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_578: # %cond.load145
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 38
 ; CHECK-RV32-NEXT:    li a4, 37
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4389,13 +4336,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_579: # %cond.load149
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 39
 ; CHECK-RV32-NEXT:    li a4, 38
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4405,13 +4352,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_580: # %cond.load153
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 40
 ; CHECK-RV32-NEXT:    li a4, 39
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4421,13 +4368,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_581: # %cond.load157
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 41
 ; CHECK-RV32-NEXT:    li a4, 40
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4437,13 +4384,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_582: # %cond.load161
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 42
 ; CHECK-RV32-NEXT:    li a4, 41
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4453,13 +4400,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_583: # %cond.load165
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 43
 ; CHECK-RV32-NEXT:    li a4, 42
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4469,13 +4416,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_584: # %cond.load169
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 44
 ; CHECK-RV32-NEXT:    li a4, 43
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4485,13 +4432,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_585: # %cond.load173
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 45
 ; CHECK-RV32-NEXT:    li a4, 44
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4501,13 +4448,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_586: # %cond.load177
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 46
 ; CHECK-RV32-NEXT:    li a4, 45
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4517,13 +4464,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_587: # %cond.load181
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 47
 ; CHECK-RV32-NEXT:    li a4, 46
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4533,13 +4480,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_588: # %cond.load185
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 48
 ; CHECK-RV32-NEXT:    li a4, 47
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4549,13 +4496,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_589: # %cond.load189
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 49
 ; CHECK-RV32-NEXT:    li a4, 48
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4565,13 +4512,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_590: # %cond.load193
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 50
 ; CHECK-RV32-NEXT:    li a4, 49
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4581,13 +4528,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_591: # %cond.load197
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 51
 ; CHECK-RV32-NEXT:    li a4, 50
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4597,13 +4544,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_592: # %cond.load201
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 52
 ; CHECK-RV32-NEXT:    li a4, 51
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4613,13 +4560,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_593: # %cond.load205
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 53
 ; CHECK-RV32-NEXT:    li a4, 52
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4629,13 +4576,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_594: # %cond.load209
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 54
 ; CHECK-RV32-NEXT:    li a4, 53
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4645,13 +4592,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_595: # %cond.load213
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 55
 ; CHECK-RV32-NEXT:    li a4, 54
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4661,13 +4608,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_596: # %cond.load217
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 56
 ; CHECK-RV32-NEXT:    li a4, 55
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4677,13 +4624,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_597: # %cond.load221
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 57
 ; CHECK-RV32-NEXT:    li a4, 56
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4693,13 +4640,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_598: # %cond.load225
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 58
 ; CHECK-RV32-NEXT:    li a4, 57
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4709,13 +4656,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_599: # %cond.load229
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 59
 ; CHECK-RV32-NEXT:    li a4, 58
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4725,13 +4672,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_600: # %cond.load233
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 60
 ; CHECK-RV32-NEXT:    li a4, 59
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4741,13 +4688,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_601: # %cond.load237
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 61
 ; CHECK-RV32-NEXT:    li a4, 60
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4758,12 +4705,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_66
 ; CHECK-RV32-NEXT:  .LBB61_602: # %cond.load249
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v17, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a2
 ; CHECK-RV32-NEXT:    li a2, 64
 ; CHECK-RV32-NEXT:    li a4, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v17, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4773,13 +4720,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_603: # %cond.load253
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 65
 ; CHECK-RV32-NEXT:    li a4, 64
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4789,13 +4736,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_604: # %cond.load257
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 66
 ; CHECK-RV32-NEXT:    li a4, 65
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4805,13 +4752,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_605: # %cond.load261
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 67
 ; CHECK-RV32-NEXT:    li a4, 66
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4821,13 +4768,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_606: # %cond.load265
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 68
 ; CHECK-RV32-NEXT:    li a4, 67
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4837,13 +4784,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_607: # %cond.load269
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 69
 ; CHECK-RV32-NEXT:    li a4, 68
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4853,13 +4800,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_608: # %cond.load273
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 70
 ; CHECK-RV32-NEXT:    li a4, 69
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4869,13 +4816,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_609: # %cond.load277
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 71
 ; CHECK-RV32-NEXT:    li a4, 70
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4885,13 +4832,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_610: # %cond.load281
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 72
 ; CHECK-RV32-NEXT:    li a4, 71
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4901,13 +4848,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_611: # %cond.load285
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 73
 ; CHECK-RV32-NEXT:    li a4, 72
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4917,13 +4864,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_612: # %cond.load289
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 74
 ; CHECK-RV32-NEXT:    li a4, 73
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4933,13 +4880,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_613: # %cond.load293
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 75
 ; CHECK-RV32-NEXT:    li a4, 74
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4949,13 +4896,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_614: # %cond.load297
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 76
 ; CHECK-RV32-NEXT:    li a4, 75
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4965,13 +4912,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_615: # %cond.load301
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 77
 ; CHECK-RV32-NEXT:    li a4, 76
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4981,13 +4928,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_616: # %cond.load305
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 78
 ; CHECK-RV32-NEXT:    li a4, 77
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4997,13 +4944,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_617: # %cond.load309
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 79
 ; CHECK-RV32-NEXT:    li a4, 78
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5013,13 +4960,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_618: # %cond.load313
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 80
 ; CHECK-RV32-NEXT:    li a4, 79
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5029,13 +4976,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_619: # %cond.load317
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 81
 ; CHECK-RV32-NEXT:    li a4, 80
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5045,13 +4992,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_620: # %cond.load321
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 82
 ; CHECK-RV32-NEXT:    li a4, 81
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5061,13 +5008,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_621: # %cond.load325
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 83
 ; CHECK-RV32-NEXT:    li a4, 82
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5077,13 +5024,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_622: # %cond.load329
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 84
 ; CHECK-RV32-NEXT:    li a4, 83
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5093,13 +5040,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_623: # %cond.load333
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 85
 ; CHECK-RV32-NEXT:    li a4, 84
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5109,13 +5056,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_624: # %cond.load337
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 86
 ; CHECK-RV32-NEXT:    li a4, 85
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5125,13 +5072,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_625: # %cond.load341
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 87
 ; CHECK-RV32-NEXT:    li a4, 86
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5141,13 +5088,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_626: # %cond.load345
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 88
 ; CHECK-RV32-NEXT:    li a4, 87
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5157,13 +5104,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_627: # %cond.load349
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 89
 ; CHECK-RV32-NEXT:    li a4, 88
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5173,13 +5120,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_628: # %cond.load353
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 90
 ; CHECK-RV32-NEXT:    li a4, 89
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5189,13 +5136,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_629: # %cond.load357
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 91
 ; CHECK-RV32-NEXT:    li a4, 90
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5205,13 +5152,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_630: # %cond.load361
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 92
 ; CHECK-RV32-NEXT:    li a4, 91
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5221,13 +5168,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_631: # %cond.load365
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 93
 ; CHECK-RV32-NEXT:    li a4, 92
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5238,12 +5185,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_100
 ; CHECK-RV32-NEXT:  .LBB61_632: # %cond.load377
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 96
 ; CHECK-RV32-NEXT:    li a4, 95
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5253,13 +5200,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_633: # %cond.load381
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 97
 ; CHECK-RV32-NEXT:    li a4, 96
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5269,13 +5216,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_634: # %cond.load385
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 98
 ; CHECK-RV32-NEXT:    li a4, 97
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5285,13 +5232,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_635: # %cond.load389
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 99
 ; CHECK-RV32-NEXT:    li a4, 98
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5301,13 +5248,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_636: # %cond.load393
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 100
 ; CHECK-RV32-NEXT:    li a4, 99
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5317,13 +5264,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_637: # %cond.load397
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 101
 ; CHECK-RV32-NEXT:    li a4, 100
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5333,13 +5280,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_638: # %cond.load401
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 102
 ; CHECK-RV32-NEXT:    li a4, 101
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5349,13 +5296,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_639: # %cond.load405
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 103
 ; CHECK-RV32-NEXT:    li a4, 102
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5365,13 +5312,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_640: # %cond.load409
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 104
 ; CHECK-RV32-NEXT:    li a4, 103
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5381,13 +5328,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_641: # %cond.load413
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 105
 ; CHECK-RV32-NEXT:    li a4, 104
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5397,13 +5344,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_642: # %cond.load417
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 106
 ; CHECK-RV32-NEXT:    li a4, 105
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5413,13 +5360,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_643: # %cond.load421
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 107
 ; CHECK-RV32-NEXT:    li a4, 106
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5429,13 +5376,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_644: # %cond.load425
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 108
 ; CHECK-RV32-NEXT:    li a4, 107
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5445,13 +5392,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_645: # %cond.load429
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 109
 ; CHECK-RV32-NEXT:    li a4, 108
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5461,13 +5408,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_646: # %cond.load433
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 110
 ; CHECK-RV32-NEXT:    li a4, 109
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5477,13 +5424,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_647: # %cond.load437
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 111
 ; CHECK-RV32-NEXT:    li a4, 110
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5493,13 +5440,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_648: # %cond.load441
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 112
 ; CHECK-RV32-NEXT:    li a4, 111
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5509,13 +5456,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_649: # %cond.load445
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 113
 ; CHECK-RV32-NEXT:    li a4, 112
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5525,13 +5472,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_650: # %cond.load449
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 114
 ; CHECK-RV32-NEXT:    li a4, 113
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5541,13 +5488,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_651: # %cond.load453
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 115
 ; CHECK-RV32-NEXT:    li a4, 114
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5557,13 +5504,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_652: # %cond.load457
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 116
 ; CHECK-RV32-NEXT:    li a4, 115
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5573,13 +5520,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_653: # %cond.load461
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 117
 ; CHECK-RV32-NEXT:    li a4, 116
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5589,13 +5536,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_654: # %cond.load465
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 118
 ; CHECK-RV32-NEXT:    li a4, 117
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5605,13 +5552,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_655: # %cond.load469
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 119
 ; CHECK-RV32-NEXT:    li a4, 118
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5621,13 +5568,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_656: # %cond.load473
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 120
 ; CHECK-RV32-NEXT:    li a4, 119
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5637,13 +5584,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_657: # %cond.load477
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 121
 ; CHECK-RV32-NEXT:    li a4, 120
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5653,13 +5600,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_658: # %cond.load481
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 122
 ; CHECK-RV32-NEXT:    li a4, 121
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5669,13 +5616,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_659: # %cond.load485
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 123
 ; CHECK-RV32-NEXT:    li a4, 122
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5685,13 +5632,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_660: # %cond.load489
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 124
 ; CHECK-RV32-NEXT:    li a4, 123
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5701,13 +5648,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_661: # %cond.load493
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 125
 ; CHECK-RV32-NEXT:    li a4, 124
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5718,12 +5665,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_134
 ; CHECK-RV32-NEXT:  .LBB61_662: # %cond.load505
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 128
 ; CHECK-RV32-NEXT:    li a4, 127
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5733,13 +5680,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_663: # %cond.load509
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 129
 ; CHECK-RV32-NEXT:    li a4, 128
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5749,13 +5696,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_664: # %cond.load513
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 130
 ; CHECK-RV32-NEXT:    li a4, 129
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5765,13 +5712,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_665: # %cond.load517
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 131
 ; CHECK-RV32-NEXT:    li a4, 130
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5781,13 +5728,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_666: # %cond.load521
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 132
 ; CHECK-RV32-NEXT:    li a4, 131
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5797,13 +5744,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_667: # %cond.load525
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 133
 ; CHECK-RV32-NEXT:    li a4, 132
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5813,13 +5760,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_668: # %cond.load529
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 134
 ; CHECK-RV32-NEXT:    li a4, 133
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5829,13 +5776,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_669: # %cond.load533
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 135
 ; CHECK-RV32-NEXT:    li a4, 134
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5845,13 +5792,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_670: # %cond.load537
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 136
 ; CHECK-RV32-NEXT:    li a4, 135
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5861,13 +5808,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_671: # %cond.load541
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 137
 ; CHECK-RV32-NEXT:    li a4, 136
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5877,13 +5824,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_672: # %cond.load545
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 138
 ; CHECK-RV32-NEXT:    li a4, 137
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5893,13 +5840,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_673: # %cond.load549
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 139
 ; CHECK-RV32-NEXT:    li a4, 138
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5909,13 +5856,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_674: # %cond.load553
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 140
 ; CHECK-RV32-NEXT:    li a4, 139
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5925,13 +5872,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_675: # %cond.load557
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 141
 ; CHECK-RV32-NEXT:    li a4, 140
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5941,13 +5888,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_676: # %cond.load561
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 142
 ; CHECK-RV32-NEXT:    li a4, 141
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5957,13 +5904,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_677: # %cond.load565
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 143
 ; CHECK-RV32-NEXT:    li a4, 142
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5973,13 +5920,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_678: # %cond.load569
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 144
 ; CHECK-RV32-NEXT:    li a4, 143
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5989,13 +5936,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_679: # %cond.load573
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 145
 ; CHECK-RV32-NEXT:    li a4, 144
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6005,13 +5952,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_680: # %cond.load577
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 146
 ; CHECK-RV32-NEXT:    li a4, 145
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6021,13 +5968,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_681: # %cond.load581
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 147
 ; CHECK-RV32-NEXT:    li a4, 146
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6037,13 +5984,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_682: # %cond.load585
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 148
 ; CHECK-RV32-NEXT:    li a4, 147
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6053,13 +6000,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_683: # %cond.load589
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 149
 ; CHECK-RV32-NEXT:    li a4, 148
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6069,13 +6016,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_684: # %cond.load593
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 150
 ; CHECK-RV32-NEXT:    li a4, 149
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6085,13 +6032,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_685: # %cond.load597
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 151
 ; CHECK-RV32-NEXT:    li a4, 150
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6101,13 +6048,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_686: # %cond.load601
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 152
 ; CHECK-RV32-NEXT:    li a4, 151
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6117,13 +6064,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_687: # %cond.load605
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 153
 ; CHECK-RV32-NEXT:    li a4, 152
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6133,13 +6080,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_688: # %cond.load609
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 154
 ; CHECK-RV32-NEXT:    li a4, 153
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6149,13 +6096,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_689: # %cond.load613
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 155
 ; CHECK-RV32-NEXT:    li a4, 154
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6165,13 +6112,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_690: # %cond.load617
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 156
 ; CHECK-RV32-NEXT:    li a4, 155
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6181,13 +6128,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_691: # %cond.load621
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 157
 ; CHECK-RV32-NEXT:    li a4, 156
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6198,479 +6145,479 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_168
 ; CHECK-RV32-NEXT:  .LBB61_692: # %cond.load633
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 160
 ; CHECK-RV32-NEXT:    li a4, 159
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_693
 ; CHECK-RV32-NEXT:    j .LBB61_172
 ; CHECK-RV32-NEXT:  .LBB61_693: # %cond.load637
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 161
 ; CHECK-RV32-NEXT:    li a4, 160
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 2
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_694
 ; CHECK-RV32-NEXT:    j .LBB61_173
 ; CHECK-RV32-NEXT:  .LBB61_694: # %cond.load641
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 162
 ; CHECK-RV32-NEXT:    li a4, 161
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 4
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_695
 ; CHECK-RV32-NEXT:    j .LBB61_174
 ; CHECK-RV32-NEXT:  .LBB61_695: # %cond.load645
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 163
 ; CHECK-RV32-NEXT:    li a4, 162
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 8
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_696
 ; CHECK-RV32-NEXT:    j .LBB61_175
 ; CHECK-RV32-NEXT:  .LBB61_696: # %cond.load649
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 164
 ; CHECK-RV32-NEXT:    li a4, 163
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 16
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_697
 ; CHECK-RV32-NEXT:    j .LBB61_176
 ; CHECK-RV32-NEXT:  .LBB61_697: # %cond.load653
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 165
 ; CHECK-RV32-NEXT:    li a4, 164
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 32
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_698
 ; CHECK-RV32-NEXT:    j .LBB61_177
 ; CHECK-RV32-NEXT:  .LBB61_698: # %cond.load657
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 166
 ; CHECK-RV32-NEXT:    li a4, 165
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 64
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_699
 ; CHECK-RV32-NEXT:    j .LBB61_178
 ; CHECK-RV32-NEXT:  .LBB61_699: # %cond.load661
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 167
 ; CHECK-RV32-NEXT:    li a4, 166
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 128
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_700
 ; CHECK-RV32-NEXT:    j .LBB61_179
 ; CHECK-RV32-NEXT:  .LBB61_700: # %cond.load665
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 168
 ; CHECK-RV32-NEXT:    li a4, 167
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 256
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_701
 ; CHECK-RV32-NEXT:    j .LBB61_180
 ; CHECK-RV32-NEXT:  .LBB61_701: # %cond.load669
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 169
 ; CHECK-RV32-NEXT:    li a4, 168
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 512
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_702
 ; CHECK-RV32-NEXT:    j .LBB61_181
 ; CHECK-RV32-NEXT:  .LBB61_702: # %cond.load673
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 170
 ; CHECK-RV32-NEXT:    li a4, 169
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1024
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_703
 ; CHECK-RV32-NEXT:    j .LBB61_182
 ; CHECK-RV32-NEXT:  .LBB61_703: # %cond.load677
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 171
 ; CHECK-RV32-NEXT:    li a4, 170
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 20
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_704
 ; CHECK-RV32-NEXT:    j .LBB61_183
 ; CHECK-RV32-NEXT:  .LBB61_704: # %cond.load681
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 172
 ; CHECK-RV32-NEXT:    li a4, 171
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 19
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_705
 ; CHECK-RV32-NEXT:    j .LBB61_184
 ; CHECK-RV32-NEXT:  .LBB61_705: # %cond.load685
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 173
 ; CHECK-RV32-NEXT:    li a4, 172
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 18
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_706
 ; CHECK-RV32-NEXT:    j .LBB61_185
 ; CHECK-RV32-NEXT:  .LBB61_706: # %cond.load689
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 174
 ; CHECK-RV32-NEXT:    li a4, 173
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 17
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_707
 ; CHECK-RV32-NEXT:    j .LBB61_186
 ; CHECK-RV32-NEXT:  .LBB61_707: # %cond.load693
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 175
 ; CHECK-RV32-NEXT:    li a4, 174
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 16
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_708
 ; CHECK-RV32-NEXT:    j .LBB61_187
 ; CHECK-RV32-NEXT:  .LBB61_708: # %cond.load697
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 176
 ; CHECK-RV32-NEXT:    li a4, 175
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 15
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_709
 ; CHECK-RV32-NEXT:    j .LBB61_188
 ; CHECK-RV32-NEXT:  .LBB61_709: # %cond.load701
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 177
 ; CHECK-RV32-NEXT:    li a4, 176
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 14
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_710
 ; CHECK-RV32-NEXT:    j .LBB61_189
 ; CHECK-RV32-NEXT:  .LBB61_710: # %cond.load705
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 178
 ; CHECK-RV32-NEXT:    li a4, 177
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 13
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_711
 ; CHECK-RV32-NEXT:    j .LBB61_190
 ; CHECK-RV32-NEXT:  .LBB61_711: # %cond.load709
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 179
 ; CHECK-RV32-NEXT:    li a4, 178
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 12
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_712
 ; CHECK-RV32-NEXT:    j .LBB61_191
 ; CHECK-RV32-NEXT:  .LBB61_712: # %cond.load713
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 180
 ; CHECK-RV32-NEXT:    li a4, 179
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 11
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_713
 ; CHECK-RV32-NEXT:    j .LBB61_192
 ; CHECK-RV32-NEXT:  .LBB61_713: # %cond.load717
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 181
 ; CHECK-RV32-NEXT:    li a4, 180
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 10
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_714
 ; CHECK-RV32-NEXT:    j .LBB61_193
 ; CHECK-RV32-NEXT:  .LBB61_714: # %cond.load721
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 182
 ; CHECK-RV32-NEXT:    li a4, 181
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 9
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_715
 ; CHECK-RV32-NEXT:    j .LBB61_194
 ; CHECK-RV32-NEXT:  .LBB61_715: # %cond.load725
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 183
 ; CHECK-RV32-NEXT:    li a4, 182
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 8
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_716
 ; CHECK-RV32-NEXT:    j .LBB61_195
 ; CHECK-RV32-NEXT:  .LBB61_716: # %cond.load729
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 184
 ; CHECK-RV32-NEXT:    li a4, 183
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 7
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_717
 ; CHECK-RV32-NEXT:    j .LBB61_196
 ; CHECK-RV32-NEXT:  .LBB61_717: # %cond.load733
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 185
 ; CHECK-RV32-NEXT:    li a4, 184
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 6
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_718
 ; CHECK-RV32-NEXT:    j .LBB61_197
 ; CHECK-RV32-NEXT:  .LBB61_718: # %cond.load737
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 186
 ; CHECK-RV32-NEXT:    li a4, 185
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 5
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_719
 ; CHECK-RV32-NEXT:    j .LBB61_198
 ; CHECK-RV32-NEXT:  .LBB61_719: # %cond.load741
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 187
 ; CHECK-RV32-NEXT:    li a4, 186
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 4
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_720
 ; CHECK-RV32-NEXT:    j .LBB61_199
 ; CHECK-RV32-NEXT:  .LBB61_720: # %cond.load745
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 188
 ; CHECK-RV32-NEXT:    li a4, 187
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 3
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_721
 ; CHECK-RV32-NEXT:    j .LBB61_200
 ; CHECK-RV32-NEXT:  .LBB61_721: # %cond.load749
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 189
 ; CHECK-RV32-NEXT:    li a4, 188
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 2
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_1030
 ; CHECK-RV32-NEXT:    j .LBB61_201
@@ -6678,12 +6625,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_202
 ; CHECK-RV32-NEXT:  .LBB61_722: # %cond.load761
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 192
 ; CHECK-RV32-NEXT:    li a4, 191
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6693,13 +6640,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_723: # %cond.load765
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 193
 ; CHECK-RV32-NEXT:    li a4, 192
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6709,13 +6656,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_724: # %cond.load769
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 194
 ; CHECK-RV32-NEXT:    li a4, 193
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6725,13 +6672,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_725: # %cond.load773
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 195
 ; CHECK-RV32-NEXT:    li a4, 194
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6741,13 +6688,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_726: # %cond.load777
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 196
 ; CHECK-RV32-NEXT:    li a4, 195
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6757,13 +6704,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_727: # %cond.load781
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 197
 ; CHECK-RV32-NEXT:    li a4, 196
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6773,13 +6720,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_728: # %cond.load785
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 198
 ; CHECK-RV32-NEXT:    li a4, 197
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6789,13 +6736,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_729: # %cond.load789
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 199
 ; CHECK-RV32-NEXT:    li a4, 198
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6805,13 +6752,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_730: # %cond.load793
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 200
 ; CHECK-RV32-NEXT:    li a4, 199
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6821,13 +6768,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_731: # %cond.load797
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 201
 ; CHECK-RV32-NEXT:    li a4, 200
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6837,13 +6784,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_732: # %cond.load801
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 202
 ; CHECK-RV32-NEXT:    li a4, 201
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6853,13 +6800,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_733: # %cond.load805
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 203
 ; CHECK-RV32-NEXT:    li a4, 202
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6869,13 +6816,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_734: # %cond.load809
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 204
 ; CHECK-RV32-NEXT:    li a4, 203
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6885,13 +6832,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_735: # %cond.load813
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 205
 ; CHECK-RV32-NEXT:    li a4, 204
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6901,13 +6848,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_736: # %cond.load817
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 206
 ; CHECK-RV32-NEXT:    li a4, 205
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6917,13 +6864,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_737: # %cond.load821
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 207
 ; CHECK-RV32-NEXT:    li a4, 206
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6933,13 +6880,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_738: # %cond.load825
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 208
 ; CHECK-RV32-NEXT:    li a4, 207
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6949,13 +6896,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_739: # %cond.load829
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 209
 ; CHECK-RV32-NEXT:    li a4, 208
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6965,13 +6912,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_740: # %cond.load833
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 210
 ; CHECK-RV32-NEXT:    li a4, 209
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6981,13 +6928,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_741: # %cond.load837
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 211
 ; CHECK-RV32-NEXT:    li a4, 210
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6997,13 +6944,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_742: # %cond.load841
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 212
 ; CHECK-RV32-NEXT:    li a4, 211
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7013,13 +6960,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_743: # %cond.load845
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 213
 ; CHECK-RV32-NEXT:    li a4, 212
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7029,13 +6976,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_744: # %cond.load849
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 214
 ; CHECK-RV32-NEXT:    li a4, 213
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7045,13 +6992,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_745: # %cond.load853
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 215
 ; CHECK-RV32-NEXT:    li a4, 214
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7061,13 +7008,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_746: # %cond.load857
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 216
 ; CHECK-RV32-NEXT:    li a4, 215
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7077,13 +7024,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_747: # %cond.load861
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 217
 ; CHECK-RV32-NEXT:    li a4, 216
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7093,13 +7040,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_748: # %cond.load865
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 218
 ; CHECK-RV32-NEXT:    li a4, 217
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7109,13 +7056,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_749: # %cond.load869
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 219
 ; CHECK-RV32-NEXT:    li a4, 218
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7125,13 +7072,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_750: # %cond.load873
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 220
 ; CHECK-RV32-NEXT:    li a4, 219
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7141,13 +7088,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_751: # %cond.load877
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 221
 ; CHECK-RV32-NEXT:    li a4, 220
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7158,479 +7105,479 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_236
 ; CHECK-RV32-NEXT:  .LBB61_752: # %cond.load889
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 224
 ; CHECK-RV32-NEXT:    li a4, 223
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_753
 ; CHECK-RV32-NEXT:    j .LBB61_240
 ; CHECK-RV32-NEXT:  .LBB61_753: # %cond.load893
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 225
 ; CHECK-RV32-NEXT:    li a4, 224
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 2
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_754
 ; CHECK-RV32-NEXT:    j .LBB61_241
 ; CHECK-RV32-NEXT:  .LBB61_754: # %cond.load897
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 226
 ; CHECK-RV32-NEXT:    li a4, 225
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 4
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_755
 ; CHECK-RV32-NEXT:    j .LBB61_242
 ; CHECK-RV32-NEXT:  .LBB61_755: # %cond.load901
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 227
 ; CHECK-RV32-NEXT:    li a4, 226
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 8
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_756
 ; CHECK-RV32-NEXT:    j .LBB61_243
 ; CHECK-RV32-NEXT:  .LBB61_756: # %cond.load905
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 228
 ; CHECK-RV32-NEXT:    li a4, 227
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 16
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_757
 ; CHECK-RV32-NEXT:    j .LBB61_244
 ; CHECK-RV32-NEXT:  .LBB61_757: # %cond.load909
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 229
 ; CHECK-RV32-NEXT:    li a4, 228
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 32
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_758
 ; CHECK-RV32-NEXT:    j .LBB61_245
 ; CHECK-RV32-NEXT:  .LBB61_758: # %cond.load913
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 230
 ; CHECK-RV32-NEXT:    li a4, 229
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 64
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_759
 ; CHECK-RV32-NEXT:    j .LBB61_246
 ; CHECK-RV32-NEXT:  .LBB61_759: # %cond.load917
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 231
 ; CHECK-RV32-NEXT:    li a4, 230
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 128
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_760
 ; CHECK-RV32-NEXT:    j .LBB61_247
 ; CHECK-RV32-NEXT:  .LBB61_760: # %cond.load921
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 232
 ; CHECK-RV32-NEXT:    li a4, 231
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 256
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_761
 ; CHECK-RV32-NEXT:    j .LBB61_248
 ; CHECK-RV32-NEXT:  .LBB61_761: # %cond.load925
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 233
 ; CHECK-RV32-NEXT:    li a4, 232
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 512
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_762
 ; CHECK-RV32-NEXT:    j .LBB61_249
 ; CHECK-RV32-NEXT:  .LBB61_762: # %cond.load929
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 234
 ; CHECK-RV32-NEXT:    li a4, 233
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1024
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_763
 ; CHECK-RV32-NEXT:    j .LBB61_250
 ; CHECK-RV32-NEXT:  .LBB61_763: # %cond.load933
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 235
 ; CHECK-RV32-NEXT:    li a4, 234
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 20
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_764
 ; CHECK-RV32-NEXT:    j .LBB61_251
 ; CHECK-RV32-NEXT:  .LBB61_764: # %cond.load937
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 236
 ; CHECK-RV32-NEXT:    li a4, 235
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 19
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_765
 ; CHECK-RV32-NEXT:    j .LBB61_252
 ; CHECK-RV32-NEXT:  .LBB61_765: # %cond.load941
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 237
 ; CHECK-RV32-NEXT:    li a4, 236
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 18
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_766
 ; CHECK-RV32-NEXT:    j .LBB61_253
 ; CHECK-RV32-NEXT:  .LBB61_766: # %cond.load945
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 238
 ; CHECK-RV32-NEXT:    li a4, 237
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 17
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_767
 ; CHECK-RV32-NEXT:    j .LBB61_254
 ; CHECK-RV32-NEXT:  .LBB61_767: # %cond.load949
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 239
 ; CHECK-RV32-NEXT:    li a4, 238
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 16
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_768
 ; CHECK-RV32-NEXT:    j .LBB61_255
 ; CHECK-RV32-NEXT:  .LBB61_768: # %cond.load953
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 240
 ; CHECK-RV32-NEXT:    li a4, 239
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 15
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_769
 ; CHECK-RV32-NEXT:    j .LBB61_256
 ; CHECK-RV32-NEXT:  .LBB61_769: # %cond.load957
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 241
 ; CHECK-RV32-NEXT:    li a4, 240
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 14
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_770
 ; CHECK-RV32-NEXT:    j .LBB61_257
 ; CHECK-RV32-NEXT:  .LBB61_770: # %cond.load961
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 242
 ; CHECK-RV32-NEXT:    li a4, 241
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 13
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_771
 ; CHECK-RV32-NEXT:    j .LBB61_258
 ; CHECK-RV32-NEXT:  .LBB61_771: # %cond.load965
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 243
 ; CHECK-RV32-NEXT:    li a4, 242
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 12
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_772
 ; CHECK-RV32-NEXT:    j .LBB61_259
 ; CHECK-RV32-NEXT:  .LBB61_772: # %cond.load969
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 244
 ; CHECK-RV32-NEXT:    li a4, 243
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 11
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_773
 ; CHECK-RV32-NEXT:    j .LBB61_260
 ; CHECK-RV32-NEXT:  .LBB61_773: # %cond.load973
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 245
 ; CHECK-RV32-NEXT:    li a4, 244
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 10
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_774
 ; CHECK-RV32-NEXT:    j .LBB61_261
 ; CHECK-RV32-NEXT:  .LBB61_774: # %cond.load977
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 246
 ; CHECK-RV32-NEXT:    li a4, 245
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 9
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_775
 ; CHECK-RV32-NEXT:    j .LBB61_262
 ; CHECK-RV32-NEXT:  .LBB61_775: # %cond.load981
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 247
 ; CHECK-RV32-NEXT:    li a4, 246
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 8
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_776
 ; CHECK-RV32-NEXT:    j .LBB61_263
 ; CHECK-RV32-NEXT:  .LBB61_776: # %cond.load985
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 248
 ; CHECK-RV32-NEXT:    li a4, 247
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 7
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_777
 ; CHECK-RV32-NEXT:    j .LBB61_264
 ; CHECK-RV32-NEXT:  .LBB61_777: # %cond.load989
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 249
 ; CHECK-RV32-NEXT:    li a4, 248
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 6
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_778
 ; CHECK-RV32-NEXT:    j .LBB61_265
 ; CHECK-RV32-NEXT:  .LBB61_778: # %cond.load993
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 250
 ; CHECK-RV32-NEXT:    li a4, 249
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 5
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_779
 ; CHECK-RV32-NEXT:    j .LBB61_266
 ; CHECK-RV32-NEXT:  .LBB61_779: # %cond.load997
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 251
 ; CHECK-RV32-NEXT:    li a4, 250
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 4
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_780
 ; CHECK-RV32-NEXT:    j .LBB61_267
 ; CHECK-RV32-NEXT:  .LBB61_780: # %cond.load1001
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 252
 ; CHECK-RV32-NEXT:    li a4, 251
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 3
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_781
 ; CHECK-RV32-NEXT:    j .LBB61_268
 ; CHECK-RV32-NEXT:  .LBB61_781: # %cond.load1005
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 253
 ; CHECK-RV32-NEXT:    li a4, 252
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 2
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_1032
 ; CHECK-RV32-NEXT:    j .LBB61_269
@@ -7638,12 +7585,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_270
 ; CHECK-RV32-NEXT:  .LBB61_782: # %cond.load1017
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 256
 ; CHECK-RV32-NEXT:    li a4, 255
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -11044,13 +10991,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_62: # %cond.load241
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 62
 ; CHECK-RV64-NEXT:    li a3, 61
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -11061,12 +11008,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_65
 ; CHECK-RV64-NEXT:  # %bb.64: # %cond.load245
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v17, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 63
 ; CHECK-RV64-NEXT:    li a3, 62
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v17, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v24, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
@@ -11325,13 +11272,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_128: # %cond.load497
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 126
 ; CHECK-RV64-NEXT:    li a3, 125
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -11342,12 +11289,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_131
 ; CHECK-RV64-NEXT:  # %bb.130: # %cond.load501
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 127
 ; CHECK-RV64-NEXT:    li a3, 126
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v18, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
@@ -11606,16 +11553,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_194: # %cond.load753
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 190
 ; CHECK-RV64-NEXT:    li a3, 189
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_195: # %else754
 ; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -11623,12 +11570,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_197
 ; CHECK-RV64-NEXT:  # %bb.196: # %cond.load757
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v20, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 191
 ; CHECK-RV64-NEXT:    li a3, 190
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v20, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
@@ -11887,16 +11834,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_260: # %cond.load1009
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 254
 ; CHECK-RV64-NEXT:    li a3, 253
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_261: # %else1010
 ; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -11904,12 +11851,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_263
 ; CHECK-RV64-NEXT:  # %bb.262: # %cond.load1013
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 255
 ; CHECK-RV64-NEXT:    li a3, 254
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v20, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
@@ -13013,374 +12960,374 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_2
 ; CHECK-RV64-NEXT:  .LBB61_528: # %cond.load1
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 4
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_529
 ; CHECK-RV64-NEXT:    j .LBB61_3
 ; CHECK-RV64-NEXT:  .LBB61_529: # %cond.load5
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 3, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 2
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 8
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_530
 ; CHECK-RV64-NEXT:    j .LBB61_4
 ; CHECK-RV64-NEXT:  .LBB61_530: # %cond.load9
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 3
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 16
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_531
 ; CHECK-RV64-NEXT:    j .LBB61_5
 ; CHECK-RV64-NEXT:  .LBB61_531: # %cond.load13
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 5, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 4
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 32
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_532
 ; CHECK-RV64-NEXT:    j .LBB61_6
 ; CHECK-RV64-NEXT:  .LBB61_532: # %cond.load17
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 5
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 5
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 64
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_533
 ; CHECK-RV64-NEXT:    j .LBB61_7
 ; CHECK-RV64-NEXT:  .LBB61_533: # %cond.load21
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 7, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 6
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 6
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 128
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_534
 ; CHECK-RV64-NEXT:    j .LBB61_8
 ; CHECK-RV64-NEXT:  .LBB61_534: # %cond.load25
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 7
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 7
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 256
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_535
 ; CHECK-RV64-NEXT:    j .LBB61_9
 ; CHECK-RV64-NEXT:  .LBB61_535: # %cond.load29
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 9, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 8
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 8
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 512
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_536
 ; CHECK-RV64-NEXT:    j .LBB61_10
 ; CHECK-RV64-NEXT:  .LBB61_536: # %cond.load33
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 9
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 9
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1024
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_537
 ; CHECK-RV64-NEXT:    j .LBB61_11
 ; CHECK-RV64-NEXT:  .LBB61_537: # %cond.load37
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 11, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 10
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 10
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 52
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_538
 ; CHECK-RV64-NEXT:    j .LBB61_12
 ; CHECK-RV64-NEXT:  .LBB61_538: # %cond.load41
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 11
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 11
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 51
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_539
 ; CHECK-RV64-NEXT:    j .LBB61_13
 ; CHECK-RV64-NEXT:  .LBB61_539: # %cond.load45
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 13, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 12
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 12
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 50
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_540
 ; CHECK-RV64-NEXT:    j .LBB61_14
 ; CHECK-RV64-NEXT:  .LBB61_540: # %cond.load49
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 13
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 13
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 49
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_541
 ; CHECK-RV64-NEXT:    j .LBB61_15
 ; CHECK-RV64-NEXT:  .LBB61_541: # %cond.load53
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 15, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 14
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 14
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 48
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_542
 ; CHECK-RV64-NEXT:    j .LBB61_16
 ; CHECK-RV64-NEXT:  .LBB61_542: # %cond.load57
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 15
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 15
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 47
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_543
 ; CHECK-RV64-NEXT:    j .LBB61_17
 ; CHECK-RV64-NEXT:  .LBB61_543: # %cond.load61
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 17, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 16
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 16
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 46
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_544
 ; CHECK-RV64-NEXT:    j .LBB61_18
 ; CHECK-RV64-NEXT:  .LBB61_544: # %cond.load65
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 18, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 17
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 17
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 45
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_545
 ; CHECK-RV64-NEXT:    j .LBB61_19
 ; CHECK-RV64-NEXT:  .LBB61_545: # %cond.load69
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 19, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 18
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 18
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 44
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_546
 ; CHECK-RV64-NEXT:    j .LBB61_20
 ; CHECK-RV64-NEXT:  .LBB61_546: # %cond.load73
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 20, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 19
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 19
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 43
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_547
 ; CHECK-RV64-NEXT:    j .LBB61_21
 ; CHECK-RV64-NEXT:  .LBB61_547: # %cond.load77
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 21, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 20
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 20
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 42
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_548
 ; CHECK-RV64-NEXT:    j .LBB61_22
 ; CHECK-RV64-NEXT:  .LBB61_548: # %cond.load81
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 22, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 21
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 21
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 41
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_549
 ; CHECK-RV64-NEXT:    j .LBB61_23
 ; CHECK-RV64-NEXT:  .LBB61_549: # %cond.load85
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 23, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 22
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 22
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 40
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_550
 ; CHECK-RV64-NEXT:    j .LBB61_24
 ; CHECK-RV64-NEXT:  .LBB61_550: # %cond.load89
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 24, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 23
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 23
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 39
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_551
 ; CHECK-RV64-NEXT:    j .LBB61_25
 ; CHECK-RV64-NEXT:  .LBB61_551: # %cond.load93
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 25, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 24
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 24
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 38
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_552
 ; CHECK-RV64-NEXT:    j .LBB61_26
 ; CHECK-RV64-NEXT:  .LBB61_552: # %cond.load97
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 26, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 25
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 25
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 37
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_553
 ; CHECK-RV64-NEXT:    j .LBB61_27
 ; CHECK-RV64-NEXT:  .LBB61_553: # %cond.load101
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 27, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 26
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 26
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 36
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_554
 ; CHECK-RV64-NEXT:    j .LBB61_28
 ; CHECK-RV64-NEXT:  .LBB61_554: # %cond.load105
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 28, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 27
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 27
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 35
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_555
 ; CHECK-RV64-NEXT:    j .LBB61_29
 ; CHECK-RV64-NEXT:  .LBB61_555: # %cond.load109
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 29, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 28
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 28
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 34
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_556
 ; CHECK-RV64-NEXT:    j .LBB61_30
 ; CHECK-RV64-NEXT:  .LBB61_556: # %cond.load113
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 30, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 29
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 29
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 33
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_557
 ; CHECK-RV64-NEXT:    j .LBB61_31
 ; CHECK-RV64-NEXT:  .LBB61_557: # %cond.load117
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 31, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 30
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 30
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 32
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_558
 ; CHECK-RV64-NEXT:    j .LBB61_32
 ; CHECK-RV64-NEXT:  .LBB61_558: # %cond.load121
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 32
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vi v8, v24, 31
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 31
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13390,13 +13337,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_559: # %cond.load125
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 33
 ; CHECK-RV64-NEXT:    li a3, 32
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13406,13 +13353,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_560: # %cond.load129
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 34
 ; CHECK-RV64-NEXT:    li a3, 33
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13422,13 +13369,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_561: # %cond.load133
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 35
 ; CHECK-RV64-NEXT:    li a3, 34
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13438,13 +13385,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_562: # %cond.load137
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 36
 ; CHECK-RV64-NEXT:    li a3, 35
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13454,13 +13401,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_563: # %cond.load141
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 37
 ; CHECK-RV64-NEXT:    li a3, 36
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13470,13 +13417,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_564: # %cond.load145
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 38
 ; CHECK-RV64-NEXT:    li a3, 37
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13486,13 +13433,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_565: # %cond.load149
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 39
 ; CHECK-RV64-NEXT:    li a3, 38
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13502,13 +13449,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_566: # %cond.load153
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 40
 ; CHECK-RV64-NEXT:    li a3, 39
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13518,13 +13465,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_567: # %cond.load157
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 41
 ; CHECK-RV64-NEXT:    li a3, 40
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13534,13 +13481,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_568: # %cond.load161
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 42
 ; CHECK-RV64-NEXT:    li a3, 41
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13550,13 +13497,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_569: # %cond.load165
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 43
 ; CHECK-RV64-NEXT:    li a3, 42
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13566,13 +13513,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_570: # %cond.load169
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 44
 ; CHECK-RV64-NEXT:    li a3, 43
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13582,13 +13529,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_571: # %cond.load173
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 45
 ; CHECK-RV64-NEXT:    li a3, 44
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13598,13 +13545,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_572: # %cond.load177
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 46
 ; CHECK-RV64-NEXT:    li a3, 45
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13614,13 +13561,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_573: # %cond.load181
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 47
 ; CHECK-RV64-NEXT:    li a3, 46
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13630,13 +13577,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_574: # %cond.load185
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 48
 ; CHECK-RV64-NEXT:    li a3, 47
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13646,13 +13593,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_575: # %cond.load189
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 49
 ; CHECK-RV64-NEXT:    li a3, 48
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13662,13 +13609,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_576: # %cond.load193
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 50
 ; CHECK-RV64-NEXT:    li a3, 49
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13678,13 +13625,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_577: # %cond.load197
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 51
 ; CHECK-RV64-NEXT:    li a3, 50
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13694,13 +13641,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_578: # %cond.load201
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 52
 ; CHECK-RV64-NEXT:    li a3, 51
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13710,13 +13657,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_579: # %cond.load205
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 53
 ; CHECK-RV64-NEXT:    li a3, 52
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13726,13 +13673,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_580: # %cond.load209
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 54
 ; CHECK-RV64-NEXT:    li a3, 53
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13742,13 +13689,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_581: # %cond.load213
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 55
 ; CHECK-RV64-NEXT:    li a3, 54
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13758,13 +13705,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_582: # %cond.load217
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 56
 ; CHECK-RV64-NEXT:    li a3, 55
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13774,13 +13721,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_583: # %cond.load221
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 57
 ; CHECK-RV64-NEXT:    li a3, 56
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13790,13 +13737,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_584: # %cond.load225
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 58
 ; CHECK-RV64-NEXT:    li a3, 57
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13806,13 +13753,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_585: # %cond.load229
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 59
 ; CHECK-RV64-NEXT:    li a3, 58
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13822,13 +13769,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_586: # %cond.load233
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 60
 ; CHECK-RV64-NEXT:    li a3, 59
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13838,13 +13785,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_587: # %cond.load237
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 61
 ; CHECK-RV64-NEXT:    li a3, 60
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13855,12 +13802,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_63
 ; CHECK-RV64-NEXT:  .LBB61_588: # %cond.load249
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vmv.s.x v9, a2
 ; CHECK-RV64-NEXT:    li a2, 64
 ; CHECK-RV64-NEXT:    li a3, 63
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13870,13 +13817,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_589: # %cond.load253
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 65
 ; CHECK-RV64-NEXT:    li a3, 64
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13886,13 +13833,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_590: # %cond.load257
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 66
 ; CHECK-RV64-NEXT:    li a3, 65
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13902,13 +13849,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_591: # %cond.load261
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 67
 ; CHECK-RV64-NEXT:    li a3, 66
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13918,13 +13865,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_592: # %cond.load265
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 68
 ; CHECK-RV64-NEXT:    li a3, 67
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13934,13 +13881,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_593: # %cond.load269
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 69
 ; CHECK-RV64-NEXT:    li a3, 68
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13950,13 +13897,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_594: # %cond.load273
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 70
 ; CHECK-RV64-NEXT:    li a3, 69
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13966,13 +13913,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_595: # %cond.load277
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 71
 ; CHECK-RV64-NEXT:    li a3, 70
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13982,13 +13929,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_596: # %cond.load281
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 72
 ; CHECK-RV64-NEXT:    li a3, 71
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13998,13 +13945,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_597: # %cond.load285
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 73
 ; CHECK-RV64-NEXT:    li a3, 72
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14014,13 +13961,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_598: # %cond.load289
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 74
 ; CHECK-RV64-NEXT:    li a3, 73
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14030,13 +13977,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_599: # %cond.load293
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 75
 ; CHECK-RV64-NEXT:    li a3, 74
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14046,13 +13993,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_600: # %cond.load297
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 76
 ; CHECK-RV64-NEXT:    li a3, 75
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14062,13 +14009,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_601: # %cond.load301
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 77
 ; CHECK-RV64-NEXT:    li a3, 76
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14078,13 +14025,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_602: # %cond.load305
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 78
 ; CHECK-RV64-NEXT:    li a3, 77
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14094,13 +14041,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_603: # %cond.load309
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 79
 ; CHECK-RV64-NEXT:    li a3, 78
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14110,13 +14057,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_604: # %cond.load313
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 80
 ; CHECK-RV64-NEXT:    li a3, 79
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14126,13 +14073,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_605: # %cond.load317
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 81
 ; CHECK-RV64-NEXT:    li a3, 80
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14142,13 +14089,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_606: # %cond.load321
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 82
 ; CHECK-RV64-NEXT:    li a3, 81
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14158,13 +14105,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_607: # %cond.load325
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 83
 ; CHECK-RV64-NEXT:    li a3, 82
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14174,13 +14121,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_608: # %cond.load329
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 84
 ; CHECK-RV64-NEXT:    li a3, 83
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14190,13 +14137,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_609: # %cond.load333
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 85
 ; CHECK-RV64-NEXT:    li a3, 84
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14206,13 +14153,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_610: # %cond.load337
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 86
 ; CHECK-RV64-NEXT:    li a3, 85
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14222,13 +14169,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_611: # %cond.load341
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 87
 ; CHECK-RV64-NEXT:    li a3, 86
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14238,13 +14185,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_612: # %cond.load345
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 88
 ; CHECK-RV64-NEXT:    li a3, 87
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14254,13 +14201,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_613: # %cond.load349
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 89
 ; CHECK-RV64-NEXT:    li a3, 88
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14270,13 +14217,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_614: # %cond.load353
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 90
 ; CHECK-RV64-NEXT:    li a3, 89
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14286,13 +14233,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_615: # %cond.load357
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 91
 ; CHECK-RV64-NEXT:    li a3, 90
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14302,13 +14249,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_616: # %cond.load361
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 92
 ; CHECK-RV64-NEXT:    li a3, 91
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14318,13 +14265,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_617: # %cond.load365
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 93
 ; CHECK-RV64-NEXT:    li a3, 92
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14334,13 +14281,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_618: # %cond.load369
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 94
 ; CHECK-RV64-NEXT:    li a3, 93
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14350,13 +14297,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_619: # %cond.load373
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 95
 ; CHECK-RV64-NEXT:    li a3, 94
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14366,13 +14313,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_620: # %cond.load377
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 96
 ; CHECK-RV64-NEXT:    li a3, 95
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14382,13 +14329,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_621: # %cond.load381
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 97
 ; CHECK-RV64-NEXT:    li a3, 96
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14398,13 +14345,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_622: # %cond.load385
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 98
 ; CHECK-RV64-NEXT:    li a3, 97
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14414,13 +14361,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_623: # %cond.load389
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 99
 ; CHECK-RV64-NEXT:    li a3, 98
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14430,13 +14377,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_624: # %cond.load393
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 100
 ; CHECK-RV64-NEXT:    li a3, 99
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14446,13 +14393,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_625: # %cond.load397
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 101
 ; CHECK-RV64-NEXT:    li a3, 100
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14462,13 +14409,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_626: # %cond.load401
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 102
 ; CHECK-RV64-NEXT:    li a3, 101
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14478,13 +14425,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_627: # %cond.load405
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 103
 ; CHECK-RV64-NEXT:    li a3, 102
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14494,13 +14441,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_628: # %cond.load409
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 104
 ; CHECK-RV64-NEXT:    li a3, 103
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14510,13 +14457,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_629: # %cond.load413
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 105
 ; CHECK-RV64-NEXT:    li a3, 104
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14526,13 +14473,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_630: # %cond.load417
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 106
 ; CHECK-RV64-NEXT:    li a3, 105
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14542,13 +14489,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_631: # %cond.load421
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 107
 ; CHECK-RV64-NEXT:    li a3, 106
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14558,13 +14505,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_632: # %cond.load425
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 108
 ; CHECK-RV64-NEXT:    li a3, 107
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14574,13 +14521,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_633: # %cond.load429
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 109
 ; CHECK-RV64-NEXT:    li a3, 108
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14590,13 +14537,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_634: # %cond.load433
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 110
 ; CHECK-RV64-NEXT:    li a3, 109
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14606,13 +14553,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_635: # %cond.load437
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 111
 ; CHECK-RV64-NEXT:    li a3, 110
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14622,13 +14569,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_636: # %cond.load441
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 112
 ; CHECK-RV64-NEXT:    li a3, 111
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14638,13 +14585,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_637: # %cond.load445
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 113
 ; CHECK-RV64-NEXT:    li a3, 112
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14654,13 +14601,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_638: # %cond.load449
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 114
 ; CHECK-RV64-NEXT:    li a3, 113
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14670,13 +14617,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_639: # %cond.load453
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 115
 ; CHECK-RV64-NEXT:    li a3, 114
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14686,13 +14633,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_640: # %cond.load457
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 116
 ; CHECK-RV64-NEXT:    li a3, 115
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14702,13 +14649,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_641: # %cond.load461
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 117
 ; CHECK-RV64-NEXT:    li a3, 116
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14718,13 +14665,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_642: # %cond.load465
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 118
 ; CHECK-RV64-NEXT:    li a3, 117
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14734,13 +14681,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_643: # %cond.load469
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 119
 ; CHECK-RV64-NEXT:    li a3, 118
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14750,13 +14697,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_644: # %cond.load473
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 120
 ; CHECK-RV64-NEXT:    li a3, 119
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14766,13 +14713,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_645: # %cond.load477
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 121
 ; CHECK-RV64-NEXT:    li a3, 120
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14782,13 +14729,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_646: # %cond.load481
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 122
 ; CHECK-RV64-NEXT:    li a3, 121
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14798,13 +14745,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_647: # %cond.load485
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 123
 ; CHECK-RV64-NEXT:    li a3, 122
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14814,13 +14761,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_648: # %cond.load489
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 124
 ; CHECK-RV64-NEXT:    li a3, 123
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14830,13 +14777,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_649: # %cond.load493
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 125
 ; CHECK-RV64-NEXT:    li a3, 124
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14847,12 +14794,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_129
 ; CHECK-RV64-NEXT:  .LBB61_650: # %cond.load505
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vmv.s.x v10, a1
 ; CHECK-RV64-NEXT:    li a1, 128
 ; CHECK-RV64-NEXT:    li a3, 127
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14862,976 +14809,976 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_651: # %cond.load509
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 129
 ; CHECK-RV64-NEXT:    li a3, 128
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 2
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_652
 ; CHECK-RV64-NEXT:    j .LBB61_134
 ; CHECK-RV64-NEXT:  .LBB61_652: # %cond.load513
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 130
 ; CHECK-RV64-NEXT:    li a3, 129
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 4
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_653
 ; CHECK-RV64-NEXT:    j .LBB61_135
 ; CHECK-RV64-NEXT:  .LBB61_653: # %cond.load517
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 131
 ; CHECK-RV64-NEXT:    li a3, 130
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 8
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_654
 ; CHECK-RV64-NEXT:    j .LBB61_136
 ; CHECK-RV64-NEXT:  .LBB61_654: # %cond.load521
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 132
 ; CHECK-RV64-NEXT:    li a3, 131
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 16
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_655
 ; CHECK-RV64-NEXT:    j .LBB61_137
 ; CHECK-RV64-NEXT:  .LBB61_655: # %cond.load525
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 133
 ; CHECK-RV64-NEXT:    li a3, 132
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 32
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_656
 ; CHECK-RV64-NEXT:    j .LBB61_138
 ; CHECK-RV64-NEXT:  .LBB61_656: # %cond.load529
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 134
 ; CHECK-RV64-NEXT:    li a3, 133
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 64
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_657
 ; CHECK-RV64-NEXT:    j .LBB61_139
 ; CHECK-RV64-NEXT:  .LBB61_657: # %cond.load533
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 135
 ; CHECK-RV64-NEXT:    li a3, 134
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 128
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_658
 ; CHECK-RV64-NEXT:    j .LBB61_140
 ; CHECK-RV64-NEXT:  .LBB61_658: # %cond.load537
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 136
 ; CHECK-RV64-NEXT:    li a3, 135
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 256
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_659
 ; CHECK-RV64-NEXT:    j .LBB61_141
 ; CHECK-RV64-NEXT:  .LBB61_659: # %cond.load541
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 137
 ; CHECK-RV64-NEXT:    li a3, 136
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 512
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_660
 ; CHECK-RV64-NEXT:    j .LBB61_142
 ; CHECK-RV64-NEXT:  .LBB61_660: # %cond.load545
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 138
 ; CHECK-RV64-NEXT:    li a3, 137
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1024
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_661
 ; CHECK-RV64-NEXT:    j .LBB61_143
 ; CHECK-RV64-NEXT:  .LBB61_661: # %cond.load549
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 139
 ; CHECK-RV64-NEXT:    li a3, 138
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 52
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_662
 ; CHECK-RV64-NEXT:    j .LBB61_144
 ; CHECK-RV64-NEXT:  .LBB61_662: # %cond.load553
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 140
 ; CHECK-RV64-NEXT:    li a3, 139
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 51
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_663
 ; CHECK-RV64-NEXT:    j .LBB61_145
 ; CHECK-RV64-NEXT:  .LBB61_663: # %cond.load557
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 141
 ; CHECK-RV64-NEXT:    li a3, 140
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 50
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_664
 ; CHECK-RV64-NEXT:    j .LBB61_146
 ; CHECK-RV64-NEXT:  .LBB61_664: # %cond.load561
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 142
 ; CHECK-RV64-NEXT:    li a3, 141
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 49
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_665
 ; CHECK-RV64-NEXT:    j .LBB61_147
 ; CHECK-RV64-NEXT:  .LBB61_665: # %cond.load565
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 143
 ; CHECK-RV64-NEXT:    li a3, 142
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 48
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_666
 ; CHECK-RV64-NEXT:    j .LBB61_148
 ; CHECK-RV64-NEXT:  .LBB61_666: # %cond.load569
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 144
 ; CHECK-RV64-NEXT:    li a3, 143
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 47
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_667
 ; CHECK-RV64-NEXT:    j .LBB61_149
 ; CHECK-RV64-NEXT:  .LBB61_667: # %cond.load573
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 145
 ; CHECK-RV64-NEXT:    li a3, 144
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 46
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_668
 ; CHECK-RV64-NEXT:    j .LBB61_150
 ; CHECK-RV64-NEXT:  .LBB61_668: # %cond.load577
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 146
 ; CHECK-RV64-NEXT:    li a3, 145
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 45
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_669
 ; CHECK-RV64-NEXT:    j .LBB61_151
 ; CHECK-RV64-NEXT:  .LBB61_669: # %cond.load581
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 147
 ; CHECK-RV64-NEXT:    li a3, 146
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 44
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_670
 ; CHECK-RV64-NEXT:    j .LBB61_152
 ; CHECK-RV64-NEXT:  .LBB61_670: # %cond.load585
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 148
 ; CHECK-RV64-NEXT:    li a3, 147
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 43
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_671
 ; CHECK-RV64-NEXT:    j .LBB61_153
 ; CHECK-RV64-NEXT:  .LBB61_671: # %cond.load589
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 149
 ; CHECK-RV64-NEXT:    li a3, 148
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 42
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_672
 ; CHECK-RV64-NEXT:    j .LBB61_154
 ; CHECK-RV64-NEXT:  .LBB61_672: # %cond.load593
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 150
 ; CHECK-RV64-NEXT:    li a3, 149
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 41
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_673
 ; CHECK-RV64-NEXT:    j .LBB61_155
 ; CHECK-RV64-NEXT:  .LBB61_673: # %cond.load597
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 151
 ; CHECK-RV64-NEXT:    li a3, 150
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 40
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_674
 ; CHECK-RV64-NEXT:    j .LBB61_156
 ; CHECK-RV64-NEXT:  .LBB61_674: # %cond.load601
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 152
 ; CHECK-RV64-NEXT:    li a3, 151
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 39
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_675
 ; CHECK-RV64-NEXT:    j .LBB61_157
 ; CHECK-RV64-NEXT:  .LBB61_675: # %cond.load605
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 153
 ; CHECK-RV64-NEXT:    li a3, 152
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 38
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_676
 ; CHECK-RV64-NEXT:    j .LBB61_158
 ; CHECK-RV64-NEXT:  .LBB61_676: # %cond.load609
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 154
 ; CHECK-RV64-NEXT:    li a3, 153
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 37
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_677
 ; CHECK-RV64-NEXT:    j .LBB61_159
 ; CHECK-RV64-NEXT:  .LBB61_677: # %cond.load613
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 155
 ; CHECK-RV64-NEXT:    li a3, 154
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 36
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_678
 ; CHECK-RV64-NEXT:    j .LBB61_160
 ; CHECK-RV64-NEXT:  .LBB61_678: # %cond.load617
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 156
 ; CHECK-RV64-NEXT:    li a3, 155
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 35
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_679
 ; CHECK-RV64-NEXT:    j .LBB61_161
 ; CHECK-RV64-NEXT:  .LBB61_679: # %cond.load621
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 157
 ; CHECK-RV64-NEXT:    li a3, 156
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 34
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_680
 ; CHECK-RV64-NEXT:    j .LBB61_162
 ; CHECK-RV64-NEXT:  .LBB61_680: # %cond.load625
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 158
 ; CHECK-RV64-NEXT:    li a3, 157
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 33
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_681
 ; CHECK-RV64-NEXT:    j .LBB61_163
 ; CHECK-RV64-NEXT:  .LBB61_681: # %cond.load629
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 159
 ; CHECK-RV64-NEXT:    li a3, 158
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 32
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_682
 ; CHECK-RV64-NEXT:    j .LBB61_164
 ; CHECK-RV64-NEXT:  .LBB61_682: # %cond.load633
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 160
 ; CHECK-RV64-NEXT:    li a3, 159
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 31
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_683
 ; CHECK-RV64-NEXT:    j .LBB61_165
 ; CHECK-RV64-NEXT:  .LBB61_683: # %cond.load637
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 161
 ; CHECK-RV64-NEXT:    li a3, 160
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 30
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_684
 ; CHECK-RV64-NEXT:    j .LBB61_166
 ; CHECK-RV64-NEXT:  .LBB61_684: # %cond.load641
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 162
 ; CHECK-RV64-NEXT:    li a3, 161
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 29
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_685
 ; CHECK-RV64-NEXT:    j .LBB61_167
 ; CHECK-RV64-NEXT:  .LBB61_685: # %cond.load645
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 163
 ; CHECK-RV64-NEXT:    li a3, 162
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 28
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_686
 ; CHECK-RV64-NEXT:    j .LBB61_168
 ; CHECK-RV64-NEXT:  .LBB61_686: # %cond.load649
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 164
 ; CHECK-RV64-NEXT:    li a3, 163
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 27
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_687
 ; CHECK-RV64-NEXT:    j .LBB61_169
 ; CHECK-RV64-NEXT:  .LBB61_687: # %cond.load653
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 165
 ; CHECK-RV64-NEXT:    li a3, 164
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 26
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_688
 ; CHECK-RV64-NEXT:    j .LBB61_170
 ; CHECK-RV64-NEXT:  .LBB61_688: # %cond.load657
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 166
 ; CHECK-RV64-NEXT:    li a3, 165
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 25
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_689
 ; CHECK-RV64-NEXT:    j .LBB61_171
 ; CHECK-RV64-NEXT:  .LBB61_689: # %cond.load661
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 167
 ; CHECK-RV64-NEXT:    li a3, 166
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 24
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_690
 ; CHECK-RV64-NEXT:    j .LBB61_172
 ; CHECK-RV64-NEXT:  .LBB61_690: # %cond.load665
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 168
 ; CHECK-RV64-NEXT:    li a3, 167
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 23
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_691
 ; CHECK-RV64-NEXT:    j .LBB61_173
 ; CHECK-RV64-NEXT:  .LBB61_691: # %cond.load669
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 169
 ; CHECK-RV64-NEXT:    li a3, 168
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 22
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_692
 ; CHECK-RV64-NEXT:    j .LBB61_174
 ; CHECK-RV64-NEXT:  .LBB61_692: # %cond.load673
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 170
 ; CHECK-RV64-NEXT:    li a3, 169
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 21
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_693
 ; CHECK-RV64-NEXT:    j .LBB61_175
 ; CHECK-RV64-NEXT:  .LBB61_693: # %cond.load677
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 171
 ; CHECK-RV64-NEXT:    li a3, 170
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 20
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_694
 ; CHECK-RV64-NEXT:    j .LBB61_176
 ; CHECK-RV64-NEXT:  .LBB61_694: # %cond.load681
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 172
 ; CHECK-RV64-NEXT:    li a3, 171
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 19
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_695
 ; CHECK-RV64-NEXT:    j .LBB61_177
 ; CHECK-RV64-NEXT:  .LBB61_695: # %cond.load685
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 173
 ; CHECK-RV64-NEXT:    li a3, 172
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 18
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_696
 ; CHECK-RV64-NEXT:    j .LBB61_178
 ; CHECK-RV64-NEXT:  .LBB61_696: # %cond.load689
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 174
 ; CHECK-RV64-NEXT:    li a3, 173
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 17
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_697
 ; CHECK-RV64-NEXT:    j .LBB61_179
 ; CHECK-RV64-NEXT:  .LBB61_697: # %cond.load693
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 175
 ; CHECK-RV64-NEXT:    li a3, 174
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 16
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_698
 ; CHECK-RV64-NEXT:    j .LBB61_180
 ; CHECK-RV64-NEXT:  .LBB61_698: # %cond.load697
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 176
 ; CHECK-RV64-NEXT:    li a3, 175
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 15
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_699
 ; CHECK-RV64-NEXT:    j .LBB61_181
 ; CHECK-RV64-NEXT:  .LBB61_699: # %cond.load701
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 177
 ; CHECK-RV64-NEXT:    li a3, 176
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 14
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_700
 ; CHECK-RV64-NEXT:    j .LBB61_182
 ; CHECK-RV64-NEXT:  .LBB61_700: # %cond.load705
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 178
 ; CHECK-RV64-NEXT:    li a3, 177
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 13
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_701
 ; CHECK-RV64-NEXT:    j .LBB61_183
 ; CHECK-RV64-NEXT:  .LBB61_701: # %cond.load709
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 179
 ; CHECK-RV64-NEXT:    li a3, 178
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 12
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_702
 ; CHECK-RV64-NEXT:    j .LBB61_184
 ; CHECK-RV64-NEXT:  .LBB61_702: # %cond.load713
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 180
 ; CHECK-RV64-NEXT:    li a3, 179
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 11
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_703
 ; CHECK-RV64-NEXT:    j .LBB61_185
 ; CHECK-RV64-NEXT:  .LBB61_703: # %cond.load717
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 181
 ; CHECK-RV64-NEXT:    li a3, 180
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 10
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_704
 ; CHECK-RV64-NEXT:    j .LBB61_186
 ; CHECK-RV64-NEXT:  .LBB61_704: # %cond.load721
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 182
 ; CHECK-RV64-NEXT:    li a3, 181
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 9
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_705
 ; CHECK-RV64-NEXT:    j .LBB61_187
 ; CHECK-RV64-NEXT:  .LBB61_705: # %cond.load725
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 183
 ; CHECK-RV64-NEXT:    li a3, 182
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 8
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_706
 ; CHECK-RV64-NEXT:    j .LBB61_188
 ; CHECK-RV64-NEXT:  .LBB61_706: # %cond.load729
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 184
 ; CHECK-RV64-NEXT:    li a3, 183
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 7
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_707
 ; CHECK-RV64-NEXT:    j .LBB61_189
 ; CHECK-RV64-NEXT:  .LBB61_707: # %cond.load733
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 185
 ; CHECK-RV64-NEXT:    li a3, 184
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 6
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_708
 ; CHECK-RV64-NEXT:    j .LBB61_190
 ; CHECK-RV64-NEXT:  .LBB61_708: # %cond.load737
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 186
 ; CHECK-RV64-NEXT:    li a3, 185
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 5
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_709
 ; CHECK-RV64-NEXT:    j .LBB61_191
 ; CHECK-RV64-NEXT:  .LBB61_709: # %cond.load741
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 187
 ; CHECK-RV64-NEXT:    li a3, 186
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 4
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_710
 ; CHECK-RV64-NEXT:    j .LBB61_192
 ; CHECK-RV64-NEXT:  .LBB61_710: # %cond.load745
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 188
 ; CHECK-RV64-NEXT:    li a3, 187
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 3
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_711
 ; CHECK-RV64-NEXT:    j .LBB61_193
 ; CHECK-RV64-NEXT:  .LBB61_711: # %cond.load749
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 189
 ; CHECK-RV64-NEXT:    li a3, 188
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 2
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_1027
 ; CHECK-RV64-NEXT:    j .LBB61_194
@@ -15839,991 +15786,991 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_195
 ; CHECK-RV64-NEXT:  .LBB61_712: # %cond.load761
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 192
 ; CHECK-RV64-NEXT:    li a3, 191
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 1
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_713
 ; CHECK-RV64-NEXT:    j .LBB61_199
 ; CHECK-RV64-NEXT:  .LBB61_713: # %cond.load765
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 193
 ; CHECK-RV64-NEXT:    li a3, 192
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 2
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_714
 ; CHECK-RV64-NEXT:    j .LBB61_200
 ; CHECK-RV64-NEXT:  .LBB61_714: # %cond.load769
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 194
 ; CHECK-RV64-NEXT:    li a3, 193
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 4
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_715
 ; CHECK-RV64-NEXT:    j .LBB61_201
 ; CHECK-RV64-NEXT:  .LBB61_715: # %cond.load773
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 195
 ; CHECK-RV64-NEXT:    li a3, 194
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 8
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_716
 ; CHECK-RV64-NEXT:    j .LBB61_202
 ; CHECK-RV64-NEXT:  .LBB61_716: # %cond.load777
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 196
 ; CHECK-RV64-NEXT:    li a3, 195
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 16
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_717
 ; CHECK-RV64-NEXT:    j .LBB61_203
 ; CHECK-RV64-NEXT:  .LBB61_717: # %cond.load781
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 197
 ; CHECK-RV64-NEXT:    li a3, 196
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 32
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_718
 ; CHECK-RV64-NEXT:    j .LBB61_204
 ; CHECK-RV64-NEXT:  .LBB61_718: # %cond.load785
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 198
 ; CHECK-RV64-NEXT:    li a3, 197
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 64
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_719
 ; CHECK-RV64-NEXT:    j .LBB61_205
 ; CHECK-RV64-NEXT:  .LBB61_719: # %cond.load789
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 199
 ; CHECK-RV64-NEXT:    li a3, 198
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 128
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_720
 ; CHECK-RV64-NEXT:    j .LBB61_206
 ; CHECK-RV64-NEXT:  .LBB61_720: # %cond.load793
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 200
 ; CHECK-RV64-NEXT:    li a3, 199
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 256
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_721
 ; CHECK-RV64-NEXT:    j .LBB61_207
 ; CHECK-RV64-NEXT:  .LBB61_721: # %cond.load797
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 201
 ; CHECK-RV64-NEXT:    li a3, 200
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 512
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_722
 ; CHECK-RV64-NEXT:    j .LBB61_208
 ; CHECK-RV64-NEXT:  .LBB61_722: # %cond.load801
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 202
 ; CHECK-RV64-NEXT:    li a3, 201
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 1024
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_723
 ; CHECK-RV64-NEXT:    j .LBB61_209
 ; CHECK-RV64-NEXT:  .LBB61_723: # %cond.load805
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 203
 ; CHECK-RV64-NEXT:    li a3, 202
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 52
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_724
 ; CHECK-RV64-NEXT:    j .LBB61_210
 ; CHECK-RV64-NEXT:  .LBB61_724: # %cond.load809
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 204
 ; CHECK-RV64-NEXT:    li a3, 203
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 51
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_725
 ; CHECK-RV64-NEXT:    j .LBB61_211
 ; CHECK-RV64-NEXT:  .LBB61_725: # %cond.load813
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 205
 ; CHECK-RV64-NEXT:    li a3, 204
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 50
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_726
 ; CHECK-RV64-NEXT:    j .LBB61_212
 ; CHECK-RV64-NEXT:  .LBB61_726: # %cond.load817
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 206
 ; CHECK-RV64-NEXT:    li a3, 205
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 49
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_727
 ; CHECK-RV64-NEXT:    j .LBB61_213
 ; CHECK-RV64-NEXT:  .LBB61_727: # %cond.load821
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 207
 ; CHECK-RV64-NEXT:    li a3, 206
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 48
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_728
 ; CHECK-RV64-NEXT:    j .LBB61_214
 ; CHECK-RV64-NEXT:  .LBB61_728: # %cond.load825
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 208
 ; CHECK-RV64-NEXT:    li a3, 207
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 47
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_729
 ; CHECK-RV64-NEXT:    j .LBB61_215
 ; CHECK-RV64-NEXT:  .LBB61_729: # %cond.load829
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 209
 ; CHECK-RV64-NEXT:    li a3, 208
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 46
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_730
 ; CHECK-RV64-NEXT:    j .LBB61_216
 ; CHECK-RV64-NEXT:  .LBB61_730: # %cond.load833
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 210
 ; CHECK-RV64-NEXT:    li a3, 209
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 45
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_731
 ; CHECK-RV64-NEXT:    j .LBB61_217
 ; CHECK-RV64-NEXT:  .LBB61_731: # %cond.load837
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 211
 ; CHECK-RV64-NEXT:    li a3, 210
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 44
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_732
 ; CHECK-RV64-NEXT:    j .LBB61_218
 ; CHECK-RV64-NEXT:  .LBB61_732: # %cond.load841
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 212
 ; CHECK-RV64-NEXT:    li a3, 211
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 43
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_733
 ; CHECK-RV64-NEXT:    j .LBB61_219
 ; CHECK-RV64-NEXT:  .LBB61_733: # %cond.load845
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 213
 ; CHECK-RV64-NEXT:    li a3, 212
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 42
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_734
 ; CHECK-RV64-NEXT:    j .LBB61_220
 ; CHECK-RV64-NEXT:  .LBB61_734: # %cond.load849
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 214
 ; CHECK-RV64-NEXT:    li a3, 213
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 41
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_735
 ; CHECK-RV64-NEXT:    j .LBB61_221
 ; CHECK-RV64-NEXT:  .LBB61_735: # %cond.load853
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 215
 ; CHECK-RV64-NEXT:    li a3, 214
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 40
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_736
 ; CHECK-RV64-NEXT:    j .LBB61_222
 ; CHECK-RV64-NEXT:  .LBB61_736: # %cond.load857
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 216
 ; CHECK-RV64-NEXT:    li a3, 215
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 39
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_737
 ; CHECK-RV64-NEXT:    j .LBB61_223
 ; CHECK-RV64-NEXT:  .LBB61_737: # %cond.load861
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 217
 ; CHECK-RV64-NEXT:    li a3, 216
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 38
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_738
 ; CHECK-RV64-NEXT:    j .LBB61_224
 ; CHECK-RV64-NEXT:  .LBB61_738: # %cond.load865
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 218
 ; CHECK-RV64-NEXT:    li a3, 217
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 37
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_739
 ; CHECK-RV64-NEXT:    j .LBB61_225
 ; CHECK-RV64-NEXT:  .LBB61_739: # %cond.load869
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 219
 ; CHECK-RV64-NEXT:    li a3, 218
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 36
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_740
 ; CHECK-RV64-NEXT:    j .LBB61_226
 ; CHECK-RV64-NEXT:  .LBB61_740: # %cond.load873
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 220
 ; CHECK-RV64-NEXT:    li a3, 219
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 35
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_741
 ; CHECK-RV64-NEXT:    j .LBB61_227
 ; CHECK-RV64-NEXT:  .LBB61_741: # %cond.load877
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 221
 ; CHECK-RV64-NEXT:    li a3, 220
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 34
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_742
 ; CHECK-RV64-NEXT:    j .LBB61_228
 ; CHECK-RV64-NEXT:  .LBB61_742: # %cond.load881
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 222
 ; CHECK-RV64-NEXT:    li a3, 221
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 33
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_743
 ; CHECK-RV64-NEXT:    j .LBB61_229
 ; CHECK-RV64-NEXT:  .LBB61_743: # %cond.load885
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 223
 ; CHECK-RV64-NEXT:    li a3, 222
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 32
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_744
 ; CHECK-RV64-NEXT:    j .LBB61_230
 ; CHECK-RV64-NEXT:  .LBB61_744: # %cond.load889
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 224
 ; CHECK-RV64-NEXT:    li a3, 223
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 31
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_745
 ; CHECK-RV64-NEXT:    j .LBB61_231
 ; CHECK-RV64-NEXT:  .LBB61_745: # %cond.load893
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 225
 ; CHECK-RV64-NEXT:    li a3, 224
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 30
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_746
 ; CHECK-RV64-NEXT:    j .LBB61_232
 ; CHECK-RV64-NEXT:  .LBB61_746: # %cond.load897
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 226
 ; CHECK-RV64-NEXT:    li a3, 225
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 29
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_747
 ; CHECK-RV64-NEXT:    j .LBB61_233
 ; CHECK-RV64-NEXT:  .LBB61_747: # %cond.load901
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 227
 ; CHECK-RV64-NEXT:    li a3, 226
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 28
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_748
 ; CHECK-RV64-NEXT:    j .LBB61_234
 ; CHECK-RV64-NEXT:  .LBB61_748: # %cond.load905
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 228
 ; CHECK-RV64-NEXT:    li a3, 227
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 27
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_749
 ; CHECK-RV64-NEXT:    j .LBB61_235
 ; CHECK-RV64-NEXT:  .LBB61_749: # %cond.load909
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 229
 ; CHECK-RV64-NEXT:    li a3, 228
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 26
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_750
 ; CHECK-RV64-NEXT:    j .LBB61_236
 ; CHECK-RV64-NEXT:  .LBB61_750: # %cond.load913
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 230
 ; CHECK-RV64-NEXT:    li a3, 229
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 25
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_751
 ; CHECK-RV64-NEXT:    j .LBB61_237
 ; CHECK-RV64-NEXT:  .LBB61_751: # %cond.load917
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 231
 ; CHECK-RV64-NEXT:    li a3, 230
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 24
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_752
 ; CHECK-RV64-NEXT:    j .LBB61_238
 ; CHECK-RV64-NEXT:  .LBB61_752: # %cond.load921
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 232
 ; CHECK-RV64-NEXT:    li a3, 231
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 23
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_753
 ; CHECK-RV64-NEXT:    j .LBB61_239
 ; CHECK-RV64-NEXT:  .LBB61_753: # %cond.load925
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 233
 ; CHECK-RV64-NEXT:    li a3, 232
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 22
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_754
 ; CHECK-RV64-NEXT:    j .LBB61_240
 ; CHECK-RV64-NEXT:  .LBB61_754: # %cond.load929
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 234
 ; CHECK-RV64-NEXT:    li a3, 233
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 21
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_755
 ; CHECK-RV64-NEXT:    j .LBB61_241
 ; CHECK-RV64-NEXT:  .LBB61_755: # %cond.load933
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 235
 ; CHECK-RV64-NEXT:    li a3, 234
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 20
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_756
 ; CHECK-RV64-NEXT:    j .LBB61_242
 ; CHECK-RV64-NEXT:  .LBB61_756: # %cond.load937
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 236
 ; CHECK-RV64-NEXT:    li a3, 235
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 19
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_757
 ; CHECK-RV64-NEXT:    j .LBB61_243
 ; CHECK-RV64-NEXT:  .LBB61_757: # %cond.load941
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 237
 ; CHECK-RV64-NEXT:    li a3, 236
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 18
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_758
 ; CHECK-RV64-NEXT:    j .LBB61_244
 ; CHECK-RV64-NEXT:  .LBB61_758: # %cond.load945
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 238
 ; CHECK-RV64-NEXT:    li a3, 237
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 17
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_759
 ; CHECK-RV64-NEXT:    j .LBB61_245
 ; CHECK-RV64-NEXT:  .LBB61_759: # %cond.load949
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 239
 ; CHECK-RV64-NEXT:    li a3, 238
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 16
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_760
 ; CHECK-RV64-NEXT:    j .LBB61_246
 ; CHECK-RV64-NEXT:  .LBB61_760: # %cond.load953
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 240
 ; CHECK-RV64-NEXT:    li a3, 239
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 15
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_761
 ; CHECK-RV64-NEXT:    j .LBB61_247
 ; CHECK-RV64-NEXT:  .LBB61_761: # %cond.load957
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 241
 ; CHECK-RV64-NEXT:    li a3, 240
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 14
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_762
 ; CHECK-RV64-NEXT:    j .LBB61_248
 ; CHECK-RV64-NEXT:  .LBB61_762: # %cond.load961
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 242
 ; CHECK-RV64-NEXT:    li a3, 241
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 13
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_763
 ; CHECK-RV64-NEXT:    j .LBB61_249
 ; CHECK-RV64-NEXT:  .LBB61_763: # %cond.load965
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 243
 ; CHECK-RV64-NEXT:    li a3, 242
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 12
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_764
 ; CHECK-RV64-NEXT:    j .LBB61_250
 ; CHECK-RV64-NEXT:  .LBB61_764: # %cond.load969
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 244
 ; CHECK-RV64-NEXT:    li a3, 243
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 11
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_765
 ; CHECK-RV64-NEXT:    j .LBB61_251
 ; CHECK-RV64-NEXT:  .LBB61_765: # %cond.load973
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 245
 ; CHECK-RV64-NEXT:    li a3, 244
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 10
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_766
 ; CHECK-RV64-NEXT:    j .LBB61_252
 ; CHECK-RV64-NEXT:  .LBB61_766: # %cond.load977
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 246
 ; CHECK-RV64-NEXT:    li a3, 245
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 9
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_767
 ; CHECK-RV64-NEXT:    j .LBB61_253
 ; CHECK-RV64-NEXT:  .LBB61_767: # %cond.load981
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 247
 ; CHECK-RV64-NEXT:    li a3, 246
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 8
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_768
 ; CHECK-RV64-NEXT:    j .LBB61_254
 ; CHECK-RV64-NEXT:  .LBB61_768: # %cond.load985
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 248
 ; CHECK-RV64-NEXT:    li a3, 247
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 7
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_769
 ; CHECK-RV64-NEXT:    j .LBB61_255
 ; CHECK-RV64-NEXT:  .LBB61_769: # %cond.load989
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 249
 ; CHECK-RV64-NEXT:    li a3, 248
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 6
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_770
 ; CHECK-RV64-NEXT:    j .LBB61_256
 ; CHECK-RV64-NEXT:  .LBB61_770: # %cond.load993
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 250
 ; CHECK-RV64-NEXT:    li a3, 249
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 5
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_771
 ; CHECK-RV64-NEXT:    j .LBB61_257
 ; CHECK-RV64-NEXT:  .LBB61_771: # %cond.load997
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 251
 ; CHECK-RV64-NEXT:    li a3, 250
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 4
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_772
 ; CHECK-RV64-NEXT:    j .LBB61_258
 ; CHECK-RV64-NEXT:  .LBB61_772: # %cond.load1001
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 252
 ; CHECK-RV64-NEXT:    li a3, 251
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 3
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_773
 ; CHECK-RV64-NEXT:    j .LBB61_259
 ; CHECK-RV64-NEXT:  .LBB61_773: # %cond.load1005
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 253
 ; CHECK-RV64-NEXT:    li a3, 252
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 2
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_1028
 ; CHECK-RV64-NEXT:    j .LBB61_260
@@ -16831,15 +16778,15 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_261
 ; CHECK-RV64-NEXT:  .LBB61_774: # %cond.load1017
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 256
 ; CHECK-RV64-NEXT:    li a3, 255
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_775
 ; CHECK-RV64-NEXT:    j .LBB61_265
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index 8e9751502460ec..869478a1efa78d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -396,8 +396,8 @@ define <vscale x 2 x i1> @extract_nxv64i1_nxv2i1_2(<vscale x 64 x i1> %mask) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
@@ -421,8 +421,8 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv32i1_4(<vscale x 32 x i1> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
index 65f22370d729a0..d60ce408278da3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
@@ -1216,8 +1216,8 @@ define float @extractelt_fadd_nxv4f32_splat(<vscale x 4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 263168
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fadd.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
@@ -1231,8 +1231,8 @@ define float @extractelt_fsub_nxv4f32_splat(<vscale x 4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 263168
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fsub.s fa0, fa4, fa5
 ; CHECK-NEXT:    ret
@@ -1246,8 +1246,8 @@ define float @extractelt_fmul_nxv4f32_splat(<vscale x 4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 3
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 263168
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fmul.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
@@ -1296,12 +1296,12 @@ define double @extractelt_nxv16f64_neg1(<vscale x 16 x double> %v) {
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    slli a2, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    vs8r.v v16, (a2)
-; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    fld fa0, -8(a0)
 ; RV32-NEXT:    addi sp, s0, -80
@@ -1329,13 +1329,13 @@ define double @extractelt_nxv16f64_neg1(<vscale x 16 x double> %v) {
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a1, a2, 3
-; RV64-NEXT:    add a3, a0, a1
 ; RV64-NEXT:    li a1, -1
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    slli a3, a2, 3
 ; RV64-NEXT:    srli a1, a1, 32
 ; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    add a3, a0, a3
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vs8r.v v16, (a3)
 ; RV64-NEXT:    bltu a2, a1, .LBB70_2
@@ -1393,9 +1393,9 @@ define double @extractelt_nxv16f64_idx(<vscale x 16 x double> %v, i32 zeroext %i
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    addi a2, sp, 64
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a0, a2, a0
 ; RV32-NEXT:    vs8r.v v8, (a2)
-; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    fld fa0, 0(a0)
@@ -1432,9 +1432,9 @@ define double @extractelt_nxv16f64_idx(<vscale x 16 x double> %v, i32 zeroext %i
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    addi a2, sp, 64
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a0, a2, a0
 ; RV64-NEXT:    vs8r.v v8, (a2)
-; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    fld fa0, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
index 14719e190a6934..796f8dde58f479 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
@@ -139,22 +139,22 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind {
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    sub sp, sp, a3
 ; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    addi a3, sp, 64
+; RV32-NEXT:    vl8r.v v8, (a0)
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vl8r.v v16, (a3)
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    vl8r.v v24, (a0)
-; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
-; RV32-NEXT:    vmseq.vi v8, v16, 0
-; RV32-NEXT:    vmseq.vi v0, v24, 0
+; RV32-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV32-NEXT:    vmseq.vi v0, v8, 0
 ; RV32-NEXT:    vmv.v.i v16, 0
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    vmseq.vi v8, v24, 0
 ; RV32-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV32-NEXT:    vs8r.v v24, (a0)
-; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v24, (a3)
 ; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vmerge.vim v8, v16, 1, v0
-; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vs8r.v v8, (a2)
 ; RV32-NEXT:    lbu a0, 0(a1)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
@@ -179,22 +179,22 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind {
 ; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
 ; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    addi a3, sp, 64
+; RV64-NEXT:    vl8r.v v8, (a0)
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vl8r.v v16, (a3)
+; RV64-NEXT:    add a0, a0, a2
 ; RV64-NEXT:    vl8r.v v24, (a0)
-; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
-; RV64-NEXT:    vmseq.vi v8, v16, 0
-; RV64-NEXT:    vmseq.vi v0, v24, 0
+; RV64-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV64-NEXT:    vmseq.vi v0, v8, 0
 ; RV64-NEXT:    vmv.v.i v16, 0
+; RV64-NEXT:    add a1, a3, a1
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    vmseq.vi v8, v24, 0
 ; RV64-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV64-NEXT:    vs8r.v v24, (a0)
-; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vs8r.v v24, (a3)
 ; RV64-NEXT:    vmv1r.v v0, v8
 ; RV64-NEXT:    vmerge.vim v8, v16, 1, v0
-; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vs8r.v v8, (a2)
 ; RV64-NEXT:    lbu a0, 0(a1)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
index e6263ec9f00047..1474c73dacfc89 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
@@ -613,8 +613,8 @@ define i64 @extractelt_nxv1i64_idx(<vscale x 1 x i64> %v, i32 %idx) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    ret
@@ -654,8 +654,8 @@ define i64 @extractelt_nxv2i64_idx(<vscale x 2 x i64> %v, i32 %idx) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    ret
@@ -695,8 +695,8 @@ define i64 @extractelt_nxv4i64_idx(<vscale x 4 x i64> %v, i32 %idx) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    ret
@@ -736,8 +736,8 @@ define i64 @extractelt_nxv8i64_idx(<vscale x 8 x i64> %v, i32 %idx) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    ret
@@ -876,12 +876,12 @@ define i32 @extractelt_nxv32i32_neg1(<vscale x 32 x i32> %v) {
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    addi a0, sp, 64
-; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    slli a2, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    vs8r.v v16, (a2)
-; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    lw a0, -4(a0)
 ; CHECK-NEXT:    addi sp, s0, -80
@@ -932,9 +932,9 @@ define i32 @extractelt_nxv32i32_idx(<vscale x 32 x i32> %v, i32 %idx) {
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    addi a2, sp, 64
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a0, a2, a0
 ; CHECK-NEXT:    vs8r.v v8, (a2)
-; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    vs8r.v v16, (a1)
 ; CHECK-NEXT:    lw a0, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index d5c2b9e4842068..a9e129ef11a2cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -862,13 +862,13 @@ define i64 @extractelt_nxv16i64_neg1(<vscale x 16 x i64> %v) {
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    addi a0, sp, 64
-; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a1, a2, 3
-; CHECK-NEXT:    add a3, a0, a1
 ; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    slli a3, a2, 3
 ; CHECK-NEXT:    srli a1, a1, 32
 ; CHECK-NEXT:    slli a2, a2, 1
+; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vs8r.v v16, (a3)
 ; CHECK-NEXT:    bltu a2, a1, .LBB74_2
@@ -926,9 +926,9 @@ define i64 @extractelt_nxv16i64_idx(<vscale x 16 x i64> %v, i32 zeroext %idx) {
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    addi a2, sp, 64
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a0, a2, a0
 ; CHECK-NEXT:    vs8r.v v8, (a2)
-; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    vs8r.v v16, (a1)
 ; CHECK-NEXT:    ld a0, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
index 4e549a5aa7c3a5..1626b362fed157 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
@@ -147,10 +147,10 @@ define <vscale x 1 x float> @ceil_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -170,10 +170,10 @@ define <vscale x 2 x float> @ceil_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -193,10 +193,10 @@ define <vscale x 4 x float> @ceil_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -216,10 +216,10 @@ define <vscale x 8 x float> @ceil_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -239,10 +239,10 @@ define <vscale x 16 x float> @ceil_nxv16f32(<vscale x 16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
index 35936574e8fe21..4aca2d694dfbbf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
@@ -17,9 +17,9 @@ define <vscale x 1 x bfloat> @ceil_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -40,9 +40,9 @@ define <vscale x 2 x bfloat> @ceil_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -63,9 +63,9 @@ define <vscale x 4 x bfloat> @ceil_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -86,9 +86,9 @@ define <vscale x 8 x bfloat> @ceil_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -109,9 +109,9 @@ define <vscale x 16 x bfloat> @ceil_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -132,9 +132,9 @@ define <vscale x 32 x bfloat> @ceil_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -144,19 +144,21 @@ define <vscale x 32 x bfloat> @ceil_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
+; CHECK-NEXT:    vfabs.v v8, v24
+; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    ret
   %a = call <vscale x 32 x bfloat> @llvm.ceil.nxv32bf16(<vscale x 32 x bfloat> %x)
   ret <vscale x 32 x bfloat> %a
@@ -182,9 +184,9 @@ define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -221,9 +223,9 @@ define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -260,9 +262,9 @@ define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -299,9 +301,9 @@ define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -338,9 +340,9 @@ define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -377,9 +379,9 @@ define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -389,19 +391,21 @@ define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT:    vfabs.v v8, v24
+; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 32 x half> @llvm.ceil.nxv32f16(<vscale x 32 x half> %x)
   ret <vscale x 32 x half> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
index f6b47743d1154c..d93f15ec440530 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
@@ -147,10 +147,10 @@ define <vscale x 1 x float> @floor_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -170,10 +170,10 @@ define <vscale x 2 x float> @floor_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -193,10 +193,10 @@ define <vscale x 4 x float> @floor_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -216,10 +216,10 @@ define <vscale x 8 x float> @floor_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -239,10 +239,10 @@ define <vscale x 16 x float> @floor_nxv16f32(<vscale x 16 x float> %x) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
index d26b74c7c139ec..010d7786c8891c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
@@ -17,9 +17,9 @@ define <vscale x 1 x bfloat> @floor_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -41,9 +41,9 @@ define <vscale x 2 x bfloat> @floor_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -65,9 +65,9 @@ define <vscale x 4 x bfloat> @floor_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -89,9 +89,9 @@ define <vscale x 8 x bfloat> @floor_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -113,9 +113,9 @@ define <vscale x 16 x bfloat> @floor_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -137,9 +137,9 @@ define <vscale x 32 x bfloat> @floor_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -149,19 +149,21 @@ define <vscale x 32 x bfloat> @floor_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
+; CHECK-NEXT:    vfabs.v v8, v24
+; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    ret
   %a = call <vscale x 32 x bfloat> @llvm.floor.nxv32bf16(<vscale x 32 x bfloat> %x)
   ret <vscale x 32 x bfloat> %a
@@ -188,9 +190,9 @@ define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -227,9 +229,9 @@ define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -266,9 +268,9 @@ define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -305,9 +307,9 @@ define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -344,9 +346,9 @@ define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -383,9 +385,9 @@ define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -395,19 +397,21 @@ define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT:    vfabs.v v8, v24
+; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 32 x half> @llvm.floor.nxv32f16(<vscale x 32 x half> %x)
   ret <vscale x 32 x half> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
index a4e7bb2f31048c..fea88673084a29 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
@@ -16,33 +16,33 @@ define <512 x i8> @single_source(<512 x i8> %a) {
 ; CHECK-NEXT:    addi s0, sp, 1536
 ; CHECK-NEXT:    .cfi_def_cfa s0, 0
 ; CHECK-NEXT:    andi sp, sp, -512
-; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    li a0, 512
 ; CHECK-NEXT:    addi a1, sp, 512
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.x.s a2, v8
+; CHECK-NEXT:    vslidedown.vi v24, v8, 5
+; CHECK-NEXT:    li a3, 432
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a1)
-; CHECK-NEXT:    lbu a0, 770(sp)
-; CHECK-NEXT:    vmv.x.s a1, v16
-; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 5
-; CHECK-NEXT:    li a0, 432
-; CHECK-NEXT:    li a1, 431
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v24, a1
+; CHECK-NEXT:    li a0, 431
+; CHECK-NEXT:    vmv.v.x v16, a2
+; CHECK-NEXT:    lbu a1, 770(sp)
+; CHECK-NEXT:    vslide1down.vx v16, v16, a1
+; CHECK-NEXT:    lbu a1, 1012(sp)
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v16, v24, a0
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v16, 4
+; CHECK-NEXT:    vslidedown.vi v24, v8, 4
 ; CHECK-NEXT:    li a0, 466
+; CHECK-NEXT:    vmv.s.x v8, a1
 ; CHECK-NEXT:    li a1, 465
-; CHECK-NEXT:    lbu a2, 1012(sp)
+; CHECK-NEXT:    li a2, 501
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v16, a1
-; CHECK-NEXT:    vmv.s.x v16, a2
-; CHECK-NEXT:    li a0, 501
-; CHECK-NEXT:    li a1, 500
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v16, a1
+; CHECK-NEXT:    vslideup.vx v16, v24, a1
+; CHECK-NEXT:    li a0, 500
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v16, v8, a0
+; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    addi sp, s0, -1536
 ; CHECK-NEXT:    .cfi_def_cfa sp, 1536
 ; CHECK-NEXT:    ld ra, 1528(sp) # 8-byte Folded Reload
@@ -61,28 +61,28 @@ define <512 x i8> @range_restriction(<512 x i8> %a) {
 ; CHECK-LABEL: range_restriction:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 512
+; CHECK-NEXT:    li a1, 254
+; CHECK-NEXT:    li a2, 432
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    li a1, 254
 ; CHECK-NEXT:    vslide1down.vx v24, v16, a1
+; CHECK-NEXT:    li a1, 431
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 5
-; CHECK-NEXT:    li a1, 432
-; CHECK-NEXT:    li a2, 431
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v24, v16, a2
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 4
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v24, v16, a1
 ; CHECK-NEXT:    li a1, 466
 ; CHECK-NEXT:    li a2, 465
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v16, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v24, v16, a2
 ; CHECK-NEXT:    li a1, 44
+; CHECK-NEXT:    li a2, 501
 ; CHECK-NEXT:    vmv.s.x v16, a1
-; CHECK-NEXT:    li a1, 501
-; CHECK-NEXT:    li a2, 500
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v24, v16, a2
+; CHECK-NEXT:    li a1, 500
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v24, v16, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16
@@ -107,53 +107,53 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) {
 ; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    li a0, 512
 ; CHECK-NEXT:    addi a1, sp, 512
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vse8.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v24, 5
-; CHECK-NEXT:    vmv.x.s a1, v24
+; CHECK-NEXT:    vmv.x.s a2, v24
+; CHECK-NEXT:    li a3, 432
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    li a1, 432
+; CHECK-NEXT:    vmv.v.x v8, a2
 ; CHECK-NEXT:    li a2, 431
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v0, a2
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v24, 4
-; CHECK-NEXT:    li a1, 466
-; CHECK-NEXT:    li a2, 465
-; CHECK-NEXT:    lbu a3, 985(sp)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v24, a2
-; CHECK-NEXT:    lbu a1, 1012(sp)
-; CHECK-NEXT:    vmv.s.x v24, a3
-; CHECK-NEXT:    li a2, 478
-; CHECK-NEXT:    li a3, 477
+; CHECK-NEXT:    vslidedown.vi v0, v24, 4
+; CHECK-NEXT:    li a2, 466
+; CHECK-NEXT:    li a3, 465
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vse8.v v24, (a1)
+; CHECK-NEXT:    lbu a1, 985(sp)
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-NEXT:    vslideup.vx v8, v0, a3
+; CHECK-NEXT:    li a2, 478
+; CHECK-NEXT:    lbu a3, 1012(sp)
 ; CHECK-NEXT:    vmv.s.x v24, a1
+; CHECK-NEXT:    li a1, 477
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v24, a1
 ; CHECK-NEXT:    li a1, 501
+; CHECK-NEXT:    vmv.s.x v24, a3
 ; CHECK-NEXT:    li a2, 500
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v24, a2
 ; CHECK-NEXT:    lui a1, 2761
-; CHECK-NEXT:    slli a1, a1, 25
-; CHECK-NEXT:    addi a1, a1, 501
-; CHECK-NEXT:    slli a1, a1, 13
-; CHECK-NEXT:    addi a1, a1, 512
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
+; CHECK-NEXT:    lui a2, 4
+; CHECK-NEXT:    vmv.s.x v25, a2
 ; CHECK-NEXT:    lui a2, 1047552
 ; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    slli a2, a2, 23
 ; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    slli a2, a2, 18
 ; CHECK-NEXT:    vslide1down.vx v0, v24, a2
-; CHECK-NEXT:    lui a2, 4
-; CHECK-NEXT:    vmv.s.x v24, a2
 ; CHECK-NEXT:    li a2, 64
+; CHECK-NEXT:    slli a1, a1, 25
+; CHECK-NEXT:    addi a1, a1, 501
+; CHECK-NEXT:    slli a1, a1, 13
+; CHECK-NEXT:    addi a1, a1, 512
 ; CHECK-NEXT:    vsetivli zero, 7, e64, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v24, 6
+; CHECK-NEXT:    vslideup.vi v0, v25, 6
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v24, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
index 2808ca3fd2621b..bfc43db2e369ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
@@ -6,11 +6,11 @@ define <1 x i1> @v1i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v1i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -26,11 +26,11 @@ define <2 x i1> @v2i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v2i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    vmv.v.i v8, 0
@@ -50,11 +50,11 @@ define <4 x i1> @v4i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v4i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    vmv.v.i v8, 0
@@ -74,11 +74,11 @@ define <8 x i1> @v8i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v8i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    vmv.v.i v8, 0
@@ -98,11 +98,11 @@ define <16 x i1> @v16i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v16i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    vmv.v.i v8, 0
@@ -123,10 +123,10 @@ define <32 x i1> @v32i1(i1 %x, i1 %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
 ; CHECK-NEXT:    li a2, 32
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vmsne.vi v10, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vmsne.vi v11, v8, 0
 ; CHECK-NEXT:    vmxor.mm v0, v10, v11
@@ -148,10 +148,10 @@ define <64 x i1> @v64i1(i1 %x, i1 %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
 ; CHECK-NEXT:    li a2, 64
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vmsne.vi v12, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vmsne.vi v13, v8, 0
 ; CHECK-NEXT:    vmxor.mm v0, v12, v13
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index c42fabd78aabf7..3eb5d36b4896a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -11,18 +11,18 @@ define <2 x i8> @vp_bitreverse_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -37,20 +37,20 @@ define <2 x i8> @vp_bitreverse_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -65,18 +65,18 @@ define <4 x i8> @vp_bitreverse_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -91,20 +91,20 @@ define <4 x i8> @vp_bitreverse_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -119,18 +119,18 @@ define <8 x i8> @vp_bitreverse_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -145,20 +145,20 @@ define <8 x i8> @vp_bitreverse_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -173,18 +173,18 @@ define <16 x i8> @vp_bitreverse_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -199,20 +199,20 @@ define <16 x i8> @vp_bitreverse_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -227,25 +227,25 @@ define <2 x i16> @vp_bitreverse_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -261,26 +261,26 @@ define <2 x i16> @vp_bitreverse_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -295,25 +295,25 @@ define <4 x i16> @vp_bitreverse_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -329,26 +329,26 @@ define <4 x i16> @vp_bitreverse_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -363,25 +363,25 @@ define <8 x i16> @vp_bitreverse_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -397,26 +397,26 @@ define <8 x i16> @vp_bitreverse_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -431,25 +431,25 @@ define <16 x i16> @vp_bitreverse_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -465,26 +465,26 @@ define <16 x i16> @vp_bitreverse_v16i16_unmasked(<16 x i16> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -505,27 +505,27 @@ define <2 x i32> @vp_bitreverse_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -541,34 +541,34 @@ define <2 x i32> @vp_bitreverse_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -589,27 +589,27 @@ define <4 x i32> @vp_bitreverse_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -625,34 +625,34 @@ define <4 x i32> @vp_bitreverse_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -673,27 +673,27 @@ define <8 x i32> @vp_bitreverse_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v10, v10, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -709,34 +709,34 @@ define <8 x i32> @vp_bitreverse_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsll.vi v12, v12, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vsll.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -757,27 +757,27 @@ define <16 x i32> @vp_bitreverse_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v12, v12, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -793,34 +793,34 @@ define <16 x i32> @vp_bitreverse_v16i32_unmasked(<16 x i32> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
-; CHECK-NEXT:    vand.vx v16, v8, a0
-; CHECK-NEXT:    vsll.vi v16, v16, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vsll.vi v16, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -835,68 +835,67 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v9, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v11, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v11, v11, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v11, v0.t
+; RV32-NEXT:    vand.vx v11, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v9, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v11, v0.t
+; RV32-NEXT:    vsrl.vx v11, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a4
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
-; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v11, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v9, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v11, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -906,59 +905,59 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e
 ; RV64-LABEL: vp_bitreverse_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a7, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
 ; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    ret
@@ -972,67 +971,67 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vand.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vor.vv v11, v12, v11
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4
-; RV32-NEXT:    vsll.vi v11, v11, 24
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v9, v11
-; RV32-NEXT:    vsrl.vx v11, v8, a1
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v10, v10, v13
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vv v13, v13, v12
+; RV32-NEXT:    vor.vv v9, v13, v9
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v11, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1042,59 +1041,59 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl)
 ; RV64-LABEL: vp_bitreverse_v2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsll.vi v9, v9, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0
-; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4
-; RV64-NEXT:    vor.vv v10, v10, v11
+; RV64-NEXT:    vsrl.vi v9, v8, 24
+; RV64-NEXT:    vsrl.vi v10, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v11, v8, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vx v11, v8, a4
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsrl.vi v11, v8, 24
-; RV64-NEXT:    vand.vx v11, v11, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v11
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vor.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v9, v9, v11
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vor.vv v8, v8, v9
+; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v9, v9, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    ret
@@ -1109,68 +1108,67 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV32-NEXT:    addi a4, sp, 8
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v10, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v12, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v14, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v14, v14, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v14, v0.t
+; RV32-NEXT:    vand.vx v14, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v14, v14, 24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v14, v0.t
-; RV32-NEXT:    vsrl.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v14, v0.t
+; RV32-NEXT:    vsrl.vx v14, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a4
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v14, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v14, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v14, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1180,59 +1178,59 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e
 ; RV64-LABEL: vp_bitreverse_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v12, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4, v0.t
+; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v14, v14, a7, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
 ; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV64-NEXT:    vsrl.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v14, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v14, v14, a3, v0.t
+; RV64-NEXT:    vsrl.vx v14, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
 ; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    ret
@@ -1246,67 +1244,67 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v14, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v10, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vand.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v10, v16, v10
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4
-; RV32-NEXT:    vsll.vi v14, v14, 24
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v14, v14, v16
-; RV32-NEXT:    vor.vv v10, v10, v14
-; RV32-NEXT:    vsrl.vx v14, v8, a1
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vsll.vx v18, v18, a4
+; RV32-NEXT:    vor.vv v12, v12, v18
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v14, v14, a5
+; RV32-NEXT:    vand.vv v18, v18, v16
+; RV32-NEXT:    vor.vv v14, v18, v14
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v10, v14, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v14, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v10, v10, v16
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v10, v10, v14
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1316,59 +1314,59 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl)
 ; RV64-LABEL: vp_bitreverse_v4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4
-; RV64-NEXT:    vor.vv v12, v12, v14
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vx v14, v8, a4
-; RV64-NEXT:    vand.vx v14, v14, a3
+; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsrl.vi v14, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v10, v8, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a5
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vor.vv v10, v16, v10
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v14, v14, a2
 ; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsrl.vi v14, v8, 24
-; RV64-NEXT:    vand.vx v14, v14, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v14, v8, a2
+; RV64-NEXT:    vsll.vi v14, v14, 8
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v14
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vor.vv v8, v8, v14
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    ret
@@ -1383,70 +1381,69 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
-; RV32-NEXT:    addi a4, sp, 8
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v12, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v20, v20, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v24, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v28, a4
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v12, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vor.vv v20, v8, v20, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vsrl.vi v20, v16, 4, v0.t
+; RV32-NEXT:    vand.vv v20, v20, v28, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v28, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 4, v0.t
+; RV32-NEXT:    vor.vv v16, v20, v16, v0.t
+; RV32-NEXT:    vsrl.vi v20, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v20, v20, v12, v0.t
+; RV32-NEXT:    vand.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vor.vv v12, v20, v12, v0.t
+; RV32-NEXT:    vsrl.vi v16, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -1454,59 +1451,59 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e
 ; RV64-LABEL: vp_bitreverse_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v20, v20, a7, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v20, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v20, v20, a3, v0.t
+; RV64-NEXT:    vsrl.vx v20, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
 ; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    ret
@@ -1520,67 +1517,67 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v20, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vand.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v12, v24, v12
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4
-; RV32-NEXT:    vsll.vi v20, v20, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vor.vv v12, v12, v20
-; RV32-NEXT:    vsrl.vx v20, v8, a1
-; RV32-NEXT:    vsrl.vx v24, v8, a3
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v24, v24, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vsll.vx v28, v28, a4
+; RV32-NEXT:    vor.vv v16, v16, v28
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v20, v20, a5
+; RV32-NEXT:    vand.vv v28, v28, v24
+; RV32-NEXT:    vor.vv v20, v28, v20
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v12, v20, v12
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v20, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v12, v12, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v12, v12, v20
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1590,59 +1587,59 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl)
 ; RV64-LABEL: vp_bitreverse_v8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v16, v20
-; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vx v20, v8, a4
-; RV64-NEXT:    vand.vx v20, v20, a3
+; RV64-NEXT:    vsrl.vi v16, v8, 24
+; RV64-NEXT:    vsrl.vi v20, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v12, v8, a3
+; RV64-NEXT:    vsrl.vx v24, v8, a5
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vor.vv v12, v24, v12
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v20, v20, a2
 ; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vsrl.vi v20, v8, 24
-; RV64-NEXT:    vand.vx v20, v20, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v20, v8, a2
+; RV64-NEXT:    vsll.vi v20, v20, 8
+; RV64-NEXT:    vor.vv v20, v24, v20
+; RV64-NEXT:    vsll.vx v24, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v20
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vor.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v24, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vor.vv v8, v8, v20
+; RV64-NEXT:    vor.vv v8, v8, v12
+; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    ret
@@ -1662,116 +1659,117 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw zero, 20(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    vmv8r.v v24, v8
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    lui a4, 209715
+; RV32-NEXT:    lui a5, 349525
 ; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    addi a2, a3, -241
+; RV32-NEXT:    sw a2, 40(sp)
+; RV32-NEXT:    sw a2, 44(sp)
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    addi a3, a4, 819
+; RV32-NEXT:    sw a3, 32(sp)
+; RV32-NEXT:    sw a3, 36(sp)
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    addi a4, a5, 1365
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a5, a6, -256
+; RV32-NEXT:    sw a4, 24(sp)
+; RV32-NEXT:    sw a4, 28(sp)
+; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v24, a1, v0.t
+; RV32-NEXT:    vsrl.vx v8, v24, a2, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsrl.vi v8, v24, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vi v16, v16, 4, v0.t
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsll.vi v24, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -1790,66 +1788,65 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
@@ -1869,91 +1866,98 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    li a5, 56
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vx v16, v8, a5
+; RV32-NEXT:    vsrl.vx v24, v8, a5
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw zero, 20(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a1, a2, -241
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a2, a3, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    addi a3, a4, 1365
+; RV32-NEXT:    addi a4, a6, -256
+; RV32-NEXT:    vsrl.vx v0, v8, a5
+; RV32-NEXT:    sw a3, 24(sp)
+; RV32-NEXT:    sw a3, 28(sp)
+; RV32-NEXT:    vand.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a4
+; RV32-NEXT:    vsll.vx v0, v0, a5
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    addi a5, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v0, v0, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v0, v8, v24
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -1962,62 +1966,78 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_bitreverse_v15i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v0, v0, v16
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsll.vx v8, v8, a5
 ; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v0
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.bitreverse.v15i64(<15 x i64> %va, <15 x i1> splat (i1 true), i32 %evl)
   ret <15 x i64> %v
@@ -2035,116 +2055,117 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw zero, 20(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    vmv8r.v v24, v8
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    lui a4, 209715
+; RV32-NEXT:    lui a5, 349525
 ; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    addi a2, a3, -241
+; RV32-NEXT:    sw a2, 40(sp)
+; RV32-NEXT:    sw a2, 44(sp)
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    addi a3, a4, 819
+; RV32-NEXT:    sw a3, 32(sp)
+; RV32-NEXT:    sw a3, 36(sp)
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    addi a4, a5, 1365
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a5, a6, -256
+; RV32-NEXT:    sw a4, 24(sp)
+; RV32-NEXT:    sw a4, 28(sp)
+; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v24, a1, v0.t
+; RV32-NEXT:    vsrl.vx v8, v24, a2, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsrl.vi v8, v24, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vi v16, v16, 4, v0.t
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsll.vi v24, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -2163,66 +2184,65 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2242,91 +2262,98 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    li a5, 56
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vx v16, v8, a5
+; RV32-NEXT:    vsrl.vx v24, v8, a5
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw zero, 20(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a1, a2, -241
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a2, a3, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    addi a3, a4, 1365
+; RV32-NEXT:    addi a4, a6, -256
+; RV32-NEXT:    vsrl.vx v0, v8, a5
+; RV32-NEXT:    sw a3, 24(sp)
+; RV32-NEXT:    sw a3, 28(sp)
+; RV32-NEXT:    vand.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a4
+; RV32-NEXT:    vsll.vx v0, v0, a5
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    addi a5, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v0, v0, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v0, v8, v24
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -2335,62 +2362,78 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_bitreverse_v16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v0, v0, v16
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsll.vx v8, v8, a5
 ; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v0
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.bitreverse.v16i64(<16 x i64> %va, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i64> %v
@@ -2422,58 +2465,58 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    lui a2, 3
+; CHECK-NEXT:    addi a3, a0, -64
+; CHECK-NEXT:    sltu a0, a0, a3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a3, a0, a3
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a4, a1, -241
+; CHECK-NEXT:    addi a1, a2, 819
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    lui a2, 3
-; CHECK-NEXT:    addi a2, a2, 819
-; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a3, 5
-; CHECK-NEXT:    addi a3, a3, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a4, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a4, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a4
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a4
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vor.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -2501,49 +2544,53 @@ define <128 x i16> @vp_bitreverse_v128i16_unmasked(<128 x i16> %va, i32 zeroext
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    vsrl.vi v24, v8, 4
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v24, v24, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vsll.vi v8, v8, 4
-; CHECK-NEXT:    vor.vv v8, v24, v8
-; CHECK-NEXT:    vsrl.vi v24, v8, 2
-; CHECK-NEXT:    lui a2, 3
-; CHECK-NEXT:    addi a2, a2, 819
-; CHECK-NEXT:    vand.vx v24, v24, a2
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v24, v8
-; CHECK-NEXT:    vsrl.vi v24, v8, 1
-; CHECK-NEXT:    lui a3, 5
-; CHECK-NEXT:    addi a3, a3, 1365
-; CHECK-NEXT:    vand.vx v24, v24, a3
-; CHECK-NEXT:    vand.vx v8, v8, a3
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    lui a3, 3
 ; CHECK-NEXT:    addi a4, a0, -64
 ; CHECK-NEXT:    sltu a0, a0, a4
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a4
+; CHECK-NEXT:    lui a4, 5
+; CHECK-NEXT:    vor.vv v8, v8, v24
+; CHECK-NEXT:    addi a2, a2, -241
+; CHECK-NEXT:    addi a3, a3, 819
+; CHECK-NEXT:    addi a4, a4, 1365
+; CHECK-NEXT:    vsrl.vi v24, v8, 4
+; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vand.vx v24, v24, a2
+; CHECK-NEXT:    vsll.vi v8, v8, 4
+; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 8
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
 ; CHECK-NEXT:    vor.vv v16, v16, v24
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v24, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a3
+; CHECK-NEXT:    vand.vx v24, v24, a3
+; CHECK-NEXT:    vsll.vi v8, v8, 2
+; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 4
-; CHECK-NEXT:    vand.vx v24, v24, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
+; CHECK-NEXT:    vand.vx v16, v16, a2
+; CHECK-NEXT:    vand.vx v24, v24, a2
 ; CHECK-NEXT:    vsll.vi v16, v16, 4
 ; CHECK-NEXT:    vor.vv v16, v24, v16
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v24, v8, 1
+; CHECK-NEXT:    vand.vx v8, v8, a4
+; CHECK-NEXT:    vand.vx v24, v24, a4
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 2
-; CHECK-NEXT:    vand.vx v24, v24, a2
-; CHECK-NEXT:    vand.vx v16, v16, a2
+; CHECK-NEXT:    vand.vx v16, v16, a3
+; CHECK-NEXT:    vand.vx v24, v24, a3
 ; CHECK-NEXT:    vsll.vi v16, v16, 2
 ; CHECK-NEXT:    vor.vv v16, v24, v16
 ; CHECK-NEXT:    vsrl.vi v24, v16, 1
-; CHECK-NEXT:    vand.vx v24, v24, a3
-; CHECK-NEXT:    vand.vx v16, v16, a3
+; CHECK-NEXT:    vand.vx v16, v16, a4
+; CHECK-NEXT:    vand.vx v24, v24, a4
 ; CHECK-NEXT:    vadd.vv v16, v16, v16
 ; CHECK-NEXT:    vor.vv v16, v24, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index 584f8520ab62fa..946ca4d1ab904a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -9,28 +9,28 @@ define void @bitreverse_v8i16(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    lui a1, 3
+; CHECK-NEXT:    addi a1, a1, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a1, 3
-; CHECK-NEXT:    addi a1, a1, 819
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    lui a1, 5
+; CHECK-NEXT:    addi a1, a1, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a1, 5
-; CHECK-NEXT:    addi a1, a1, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vse16.v v8, (a0)
@@ -56,36 +56,36 @@ define void @bitreverse_v4i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    addi a1, a1, -256
-; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
-; CHECK-NEXT:    vsll.vi v10, v10, 8
+; CHECK-NEXT:    lui a1, 61681
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vsll.vi v8, v8, 24
+; CHECK-NEXT:    vsll.vi v10, v10, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    lui a1, 209715
+; CHECK-NEXT:    addi a1, a1, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a1, 209715
-; CHECK-NEXT:    addi a1, a1, 819
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    lui a1, 349525
+; CHECK-NEXT:    addi a1, a1, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a1, 349525
-; CHECK-NEXT:    addi a1, a1, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -113,65 +113,65 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a2, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vlse64.v v11, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v10, v10, a4
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    addi a2, a5, -256
+; RV32-NEXT:    vlse64.v v9, (a6), zero
+; RV32-NEXT:    vsrl.vx v10, v8, a3
+; RV32-NEXT:    vsrl.vx v11, v8, a4
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsll.vx v13, v8, a3
+; RV32-NEXT:    vand.vx v11, v11, a2
+; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vx v11, v8, a2
+; RV32-NEXT:    vsll.vx v11, v11, a4
+; RV32-NEXT:    vor.vv v11, v13, v11
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vand.vv v13, v13, v9
+; RV32-NEXT:    vor.vv v12, v13, v12
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a4, a4, 1365
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a4
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a2
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v9, v8, v9
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v9, v9, 8
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v9, a3
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v11, a4
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v10, v9
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
@@ -184,58 +184,58 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) {
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vsrl.vx v9, v8, a1
 ; RV64-NEXT:    li a2, 40
-; RV64-NEXT:    vsrl.vx v10, v8, a2
 ; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v10, v10, a3
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    lui a4, 4080
-; RV64-NEXT:    vand.vx v10, v10, a4
-; RV64-NEXT:    vsrl.vi v11, v8, 8
 ; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    addiw a3, a3, -256
 ; RV64-NEXT:    slli a5, a5, 24
-; RV64-NEXT:    vand.vx v11, v11, a5
-; RV64-NEXT:    vor.vv v10, v11, v10
+; RV64-NEXT:    vsrl.vx v9, v8, a1
+; RV64-NEXT:    vsrl.vx v10, v8, a2
+; RV64-NEXT:    vsrl.vi v11, v8, 24
+; RV64-NEXT:    vsrl.vi v12, v8, 8
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vor.vv v9, v10, v9
 ; RV64-NEXT:    vand.vx v10, v8, a5
+; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vand.vx v12, v12, a5
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a4
 ; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vand.vx v11, v8, a4
-; RV64-NEXT:    vsll.vi v11, v11, 24
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsll.vx v11, v8, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a1
 ; RV64-NEXT:    vand.vx v8, v8, a3
 ; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    vor.vv v8, v11, v8
+; RV64-NEXT:    vor.vv v8, v12, v8
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vor.vv v9, v11, v9
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    addiw a1, a1, -241
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v9, a1
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v9, v9, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v9, a1
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vand.vx v9, v9, a3
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vse64.v v8, (a0)
@@ -261,28 +261,28 @@ define void @bitreverse_v16i16(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    lui a1, 3
+; CHECK-NEXT:    addi a1, a1, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a1, 3
-; CHECK-NEXT:    addi a1, a1, 819
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    lui a1, 5
+; CHECK-NEXT:    addi a1, a1, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a1, 5
-; CHECK-NEXT:    addi a1, a1, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vse16.v v8, (a0)
@@ -308,36 +308,36 @@ define void @bitreverse_v8i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    addi a1, a1, -256
-; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a1
-; CHECK-NEXT:    vsll.vi v12, v12, 8
+; CHECK-NEXT:    lui a1, 61681
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vsll.vi v8, v8, 24
+; CHECK-NEXT:    vsll.vi v12, v12, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    lui a1, 209715
+; CHECK-NEXT:    addi a1, a1, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a1, 209715
-; CHECK-NEXT:    addi a1, a1, 819
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    lui a1, 349525
+; CHECK-NEXT:    addi a1, a1, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a1, 349525
-; CHECK-NEXT:    addi a1, a1, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -365,65 +365,65 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a2, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v12, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vlse64.v v14, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v14
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    vand.vx v16, v8, a3
-; RV32-NEXT:    vsll.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a4
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    addi a2, a5, -256
+; RV32-NEXT:    vlse64.v v10, (a6), zero
+; RV32-NEXT:    vsrl.vx v12, v8, a3
+; RV32-NEXT:    vsrl.vx v14, v8, a4
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vsll.vx v18, v8, a3
+; RV32-NEXT:    vand.vx v14, v14, a2
+; RV32-NEXT:    vor.vv v14, v14, v12
+; RV32-NEXT:    vand.vx v12, v8, a2
+; RV32-NEXT:    vsll.vx v12, v12, a4
+; RV32-NEXT:    vor.vv v12, v18, v12
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vand.vv v18, v18, v10
+; RV32-NEXT:    vor.vv v16, v18, v16
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a4, a4, 1365
+; RV32-NEXT:    vor.vv v14, v16, v14
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v10, v8, v10
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a3
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v12, a4
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v8, v14
+; RV32-NEXT:    vsrl.vi v14, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v14, v14, v16
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v14, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v14, v10
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
@@ -434,60 +434,60 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ; RV64-LABEL: bitreverse_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    vle64.v v14, (a0)
 ; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vsrl.vx v10, v8, a1
 ; RV64-NEXT:    li a2, 40
-; RV64-NEXT:    vsrl.vx v12, v8, a2
 ; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v12, v12, a3
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vi v12, v8, 24
 ; RV64-NEXT:    lui a4, 4080
-; RV64-NEXT:    vand.vx v12, v12, a4
-; RV64-NEXT:    vsrl.vi v14, v8, 8
 ; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    addiw a3, a3, -256
 ; RV64-NEXT:    slli a5, a5, 24
-; RV64-NEXT:    vand.vx v14, v14, a5
-; RV64-NEXT:    vor.vv v12, v14, v12
+; RV64-NEXT:    vsrl.vx v8, v14, a1
+; RV64-NEXT:    vsrl.vx v10, v14, a2
+; RV64-NEXT:    vsrl.vi v12, v14, 24
+; RV64-NEXT:    vsrl.vi v16, v14, 8
+; RV64-NEXT:    vand.vx v10, v10, a3
+; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vand.vx v18, v14, a5
+; RV64-NEXT:    vand.vx v10, v12, a4
+; RV64-NEXT:    vand.vx v12, v16, a5
 ; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vand.vx v12, v8, a5
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vand.vx v14, v8, a4
-; RV64-NEXT:    vsll.vi v14, v14, 24
-; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsll.vx v14, v8, a1
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    vor.vv v8, v14, v8
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vand.vx v12, v14, a4
+; RV64-NEXT:    vsll.vi v16, v18, 8
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vor.vv v12, v12, v16
+; RV64-NEXT:    vsll.vx v16, v14, a1
+; RV64-NEXT:    vand.vx v14, v14, a3
+; RV64-NEXT:    vsll.vx v14, v14, a2
+; RV64-NEXT:    vor.vv v14, v16, v14
 ; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 349525
 ; RV64-NEXT:    addiw a1, a1, -241
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vor.vv v10, v14, v12
+; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v10, a1
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v10, a1
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index 8bfdf9b6884a27..d765e4c0b8f6a9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -136,9 +136,9 @@ define <2 x i32> @vp_bswap_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -178,9 +178,9 @@ define <4 x i32> @vp_bswap_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -220,9 +220,9 @@ define <8 x i32> @vp_bswap_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsll.vi v12, v12, 8
@@ -262,9 +262,9 @@ define <16 x i32> @vp_bswap_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
@@ -284,38 +284,38 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v11, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -324,31 +324,31 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a5, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
-; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
@@ -363,39 +363,39 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vand.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vor.vv v11, v12, v11
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4
-; RV32-NEXT:    vsll.vi v11, v11, 24
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v10, v10, v13
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vv v13, v13, v12
+; RV32-NEXT:    vor.vv v9, v13, v9
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v9, v11
-; RV32-NEXT:    vsrl.vx v11, v8, a1
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v9, v8
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -403,34 +403,34 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsll.vi v9, v9, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0
-; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4
-; RV64-NEXT:    vor.vv v10, v10, v11
+; RV64-NEXT:    vsrl.vi v9, v8, 24
+; RV64-NEXT:    vsrl.vi v10, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v11, v8, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vx v11, v8, a4
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsrl.vi v11, v8, 24
-; RV64-NEXT:    vand.vx v11, v11, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v11
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vor.vv v8, v9, v8
+; RV64-NEXT:    vor.vv v9, v9, v11
+; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> %va, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i64> %v
@@ -444,38 +444,38 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v10, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v14, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v14, v14, 24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
+; RV32-NEXT:    vsll.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vand.vx v12, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v14, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v14, v0.t
-; RV32-NEXT:    vsrl.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -484,31 +484,31 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4, v0.t
+; RV64-NEXT:    vsll.vx v12, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v14, v14, a5, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
 ; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
-; RV64-NEXT:    vsrl.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v14, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v14, v14, a3, v0.t
+; RV64-NEXT:    vsrl.vx v12, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v14, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
 ; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
@@ -523,39 +523,39 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v14, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vand.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v14, v16, v14
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4
-; RV32-NEXT:    vsll.vi v14, v14, 24
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vsll.vx v18, v18, a4
+; RV32-NEXT:    vor.vv v12, v12, v18
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v10, v10, a5
+; RV32-NEXT:    vand.vv v18, v18, v16
+; RV32-NEXT:    vor.vv v10, v18, v10
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v14, v14, v16
-; RV32-NEXT:    vor.vv v10, v10, v14
-; RV32-NEXT:    vsrl.vx v14, v8, a1
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v10, v10, v14
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -563,34 +563,34 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4
-; RV64-NEXT:    vor.vv v12, v12, v14
+; RV64-NEXT:    vsrl.vi v10, v8, 24
+; RV64-NEXT:    vsrl.vi v12, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v14, v8, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a5
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vx v14, v8, a4
-; RV64-NEXT:    vand.vx v14, v14, a3
-; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsrl.vi v14, v8, 24
-; RV64-NEXT:    vand.vx v14, v14, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v12, v8, a2
+; RV64-NEXT:    vsll.vi v12, v12, 8
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v14
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vor.vv v10, v10, v14
+; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
@@ -604,34 +604,34 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
 ; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
@@ -644,31 +644,31 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v20, v20, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v20, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v20, v20, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v20, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
 ; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
@@ -683,39 +683,39 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v20, v8, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vand.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v20, v24, v20
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4
-; RV32-NEXT:    vsll.vi v20, v20, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsll.vx v28, v28, a4
+; RV32-NEXT:    vor.vv v16, v16, v28
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v12, v12, a5
+; RV32-NEXT:    vand.vv v28, v28, v24
+; RV32-NEXT:    vor.vv v12, v28, v12
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vor.vv v12, v12, v20
-; RV32-NEXT:    vsrl.vx v20, v8, a1
-; RV32-NEXT:    vsrl.vx v24, v8, a3
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v24, v24, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v12, v12, v20
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -723,34 +723,34 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v16, v20
+; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsrl.vi v16, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v20, v8, a3
+; RV64-NEXT:    vsrl.vx v24, v8, a5
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vor.vv v20, v24, v20
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vx v20, v8, a4
-; RV64-NEXT:    vand.vx v20, v20, a3
-; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vsrl.vi v20, v8, 24
-; RV64-NEXT:    vand.vx v20, v20, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsll.vx v24, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v20
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v24, v8
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vor.vv v8, v12, v8
+; RV64-NEXT:    vor.vv v12, v12, v20
+; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> %va, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i64> %v
@@ -769,33 +769,33 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
@@ -814,14 +814,14 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -857,36 +857,35 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
@@ -909,51 +908,59 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v24, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    vsll.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v16, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v0, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -962,35 +969,51 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_bswap_v15i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    vsll.vx v0, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v0, v8
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64> %va, <15 x i1> splat (i1 true), i32 %evl)
   ret <15 x i64> %v
@@ -1009,33 +1032,33 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
@@ -1054,14 +1077,14 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1097,36 +1120,35 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
@@ -1149,51 +1171,59 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v24, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    vsll.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v16, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v0, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1202,35 +1232,51 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_bswap_v16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    vsll.vx v0, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v0, v8
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> %va, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
index 1dff8aed060541..5e491f21e62134 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -35,15 +35,15 @@ define void @bswap_v4i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    addi a1, a1, -256
-; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
-; CHECK-NEXT:    vsll.vi v10, v10, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 24
+; CHECK-NEXT:    vsll.vi v10, v10, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -72,36 +72,36 @@ define void @bswap_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    lui a4, 16
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    addi a1, a4, -256
+; RV32-NEXT:    vlse64.v v9, (a6), zero
 ; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vlse64.v v11, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v10, v10, a4
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    vsrl.vx v11, v8, a3
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsll.vx v13, v8, a2
+; RV32-NEXT:    vand.vx v11, v11, a1
+; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vx v11, v8, a1
+; RV32-NEXT:    vsll.vx v11, v11, a3
+; RV32-NEXT:    vor.vv v11, v13, v11
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v12, v12, a5
+; RV32-NEXT:    vand.vv v13, v13, v9
+; RV32-NEXT:    vor.vv v12, v13, v12
+; RV32-NEXT:    vand.vv v9, v8, v9
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v9, v9, 8
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a4
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -112,31 +112,31 @@ define void @bswap_v2i64(ptr %x, ptr %y) {
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vsrl.vx v9, v8, a1
 ; RV64-NEXT:    li a2, 40
-; RV64-NEXT:    vsrl.vx v10, v8, a2
 ; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v10, v10, a3
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    lui a4, 4080
-; RV64-NEXT:    vand.vx v10, v10, a4
-; RV64-NEXT:    vsrl.vi v11, v8, 8
 ; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    addiw a3, a3, -256
 ; RV64-NEXT:    slli a5, a5, 24
-; RV64-NEXT:    vand.vx v11, v11, a5
-; RV64-NEXT:    vor.vv v10, v11, v10
+; RV64-NEXT:    vsrl.vx v9, v8, a1
+; RV64-NEXT:    vsrl.vx v10, v8, a2
+; RV64-NEXT:    vsrl.vi v11, v8, 24
+; RV64-NEXT:    vsrl.vi v12, v8, 8
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vor.vv v9, v10, v9
 ; RV64-NEXT:    vand.vx v10, v8, a5
+; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vand.vx v12, v12, a5
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a4
 ; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vand.vx v11, v8, a4
-; RV64-NEXT:    vsll.vi v11, v11, 24
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsll.vx v11, v8, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a1
 ; RV64-NEXT:    vand.vx v8, v8, a3
 ; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    vor.vv v8, v11, v8
+; RV64-NEXT:    vor.vv v8, v12, v8
+; RV64-NEXT:    vor.vv v9, v11, v9
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vse64.v v8, (a0)
@@ -188,15 +188,15 @@ define void @bswap_v8i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    addi a1, a1, -256
-; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a1
-; CHECK-NEXT:    vsll.vi v12, v12, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 24
+; CHECK-NEXT:    vsll.vi v12, v12, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -225,36 +225,36 @@ define void @bswap_v4i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    lui a4, 16
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    addi a1, a4, -256
+; RV32-NEXT:    vlse64.v v10, (a6), zero
 ; RV32-NEXT:    vsrl.vx v12, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vlse64.v v14, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v14
+; RV32-NEXT:    vsrl.vx v14, v8, a3
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vsll.vx v18, v8, a2
+; RV32-NEXT:    vand.vx v14, v14, a1
+; RV32-NEXT:    vor.vv v12, v14, v12
+; RV32-NEXT:    vand.vx v14, v8, a1
+; RV32-NEXT:    vsll.vx v14, v14, a3
+; RV32-NEXT:    vor.vv v14, v18, v14
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vand.vv v18, v18, v10
+; RV32-NEXT:    vor.vv v16, v18, v16
+; RV32-NEXT:    vand.vv v10, v8, v10
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    vand.vx v16, v8, a3
-; RV32-NEXT:    vsll.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a4
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v14, v8
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -265,31 +265,31 @@ define void @bswap_v4i64(ptr %x, ptr %y) {
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vsrl.vx v10, v8, a1
 ; RV64-NEXT:    li a2, 40
-; RV64-NEXT:    vsrl.vx v12, v8, a2
 ; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v12, v12, a3
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vi v12, v8, 24
 ; RV64-NEXT:    lui a4, 4080
-; RV64-NEXT:    vand.vx v12, v12, a4
-; RV64-NEXT:    vsrl.vi v14, v8, 8
 ; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    addiw a3, a3, -256
 ; RV64-NEXT:    slli a5, a5, 24
-; RV64-NEXT:    vand.vx v14, v14, a5
-; RV64-NEXT:    vor.vv v12, v14, v12
+; RV64-NEXT:    vsrl.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vx v12, v8, a2
+; RV64-NEXT:    vsrl.vi v14, v8, 24
+; RV64-NEXT:    vsrl.vi v16, v8, 8
+; RV64-NEXT:    vand.vx v12, v12, a3
 ; RV64-NEXT:    vor.vv v10, v12, v10
 ; RV64-NEXT:    vand.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v14, v14, a4
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vand.vx v16, v8, a4
 ; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vand.vx v14, v8, a4
-; RV64-NEXT:    vsll.vi v14, v14, 24
-; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsll.vx v14, v8, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vsll.vx v16, v8, a1
 ; RV64-NEXT:    vand.vx v8, v8, a3
 ; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    vor.vv v8, v14, v8
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vor.vv v10, v14, v10
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index 5d75efe681af73..b94a523e130440 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -31,12 +31,12 @@ define <8 x i32> @add_constant_rhs_8xi32(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI1_0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a4
-; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI1_0)
 ; CHECK-NEXT:    vle32.v v10, (a0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a5
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a6
@@ -118,22 +118,22 @@ define <4 x i32> @udiv_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI4_0)
-; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    vslide1down.vx v8, v8, a2
-; CHECK-NEXT:    vslide1down.vx v8, v8, a3
-; CHECK-NEXT:    vmulhu.vv v9, v8, v9
-; CHECK-NEXT:    vsub.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    lui a0, 524288
-; CHECK-NEXT:    vslide1down.vx v11, v11, a0
+; CHECK-NEXT:    lui a1, 524288
+; CHECK-NEXT:    vle32.v v10, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI4_1)
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vmulhu.vv v10, v10, v11
-; CHECK-NEXT:    vadd.vv v9, v10, v9
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vle32.v v11, (a0)
+; CHECK-NEXT:    vslide1down.vx v8, v8, a2
+; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vmulhu.vv v10, v8, v10
+; CHECK-NEXT:    vsub.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v9, v12, v9
+; CHECK-NEXT:    vadd.vv v9, v9, v10
 ; CHECK-NEXT:    vmv.v.i v0, 4
-; CHECK-NEXT:    vsrl.vv v9, v9, v12
+; CHECK-NEXT:    vsrl.vv v9, v9, v11
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    ret
   %e0 = udiv i32 %a, 23
@@ -224,12 +224,12 @@ define <4 x i32> @add_constant_rhs_with_identity(i32 %a, i32 %b, i32 %c, i32 %d)
 ; RV32-NEXT:    addi a1, a1, 25
 ; RV32-NEXT:    addi a2, a2, 1
 ; RV32-NEXT:    addi a3, a3, 2047
-; RV32-NEXT:    addi a3, a3, 308
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    addi a0, a3, 308
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_constant_rhs_with_identity:
@@ -237,12 +237,12 @@ define <4 x i32> @add_constant_rhs_with_identity(i32 %a, i32 %b, i32 %c, i32 %d)
 ; RV64-NEXT:    addiw a1, a1, 25
 ; RV64-NEXT:    addiw a2, a2, 1
 ; RV64-NEXT:    addi a3, a3, 2047
-; RV64-NEXT:    addiw a3, a3, 308
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    addiw a0, a3, 308
 ; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    vslide1down.vx v8, v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a3
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %e0 = add i32 %a, 0
   %e1 = add i32 %b, 25
@@ -261,12 +261,12 @@ define <4 x i32> @add_constant_rhs_identity(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; RV32-NEXT:    addi a1, a1, 25
 ; RV32-NEXT:    addi a2, a2, 1
 ; RV32-NEXT:    addi a3, a3, 2047
-; RV32-NEXT:    addi a3, a3, 308
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    addi a0, a3, 308
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_constant_rhs_identity:
@@ -274,12 +274,12 @@ define <4 x i32> @add_constant_rhs_identity(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; RV64-NEXT:    addiw a1, a1, 25
 ; RV64-NEXT:    addiw a2, a2, 1
 ; RV64-NEXT:    addi a3, a3, 2047
-; RV64-NEXT:    addiw a3, a3, 308
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    addiw a0, a3, 308
 ; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    vslide1down.vx v8, v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a3
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %e0 = add i32 %a, 0
   %e1 = add i32 %b, 25
@@ -502,17 +502,17 @@ define <8 x i32> @add_constant_rhs_8xi32_vector_in2(<8 x i32> %vin, i32 %a, i32
 ; CHECK-NEXT:    addi a1, a1, 25
 ; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    addi a3, a3, 2047
-; CHECK-NEXT:    addi a3, a3, 308
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-NEXT:    vmv.s.x v10, a0
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    vmv.s.x v10, a1
 ; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 5
+; CHECK-NEXT:    addi a0, a3, 308
 ; CHECK-NEXT:    vmv.s.x v10, a2
 ; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 6
-; CHECK-NEXT:    vmv.s.x v10, a3
+; CHECK-NEXT:    vmv.s.x v10, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 7
 ; CHECK-NEXT:    ret
@@ -534,15 +534,15 @@ define <8 x i32> @add_constant_rhs_8xi32_vector_in3(<8 x i32> %vin, i32 %a, i32
 ; CHECK-NEXT:    addi a1, a1, 25
 ; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    addi a3, a3, 2047
-; CHECK-NEXT:    addi a3, a3, 308
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    vmv.s.x v10, a1
 ; CHECK-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-NEXT:    addi a0, a3, 308
 ; CHECK-NEXT:    vmv.s.x v10, a2
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vmv.s.x v10, a3
+; CHECK-NEXT:    vmv.s.x v10, a0
 ; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 6
 ; CHECK-NEXT:    ret
@@ -562,20 +562,21 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-NEXT:    vmv.s.x v10, a0
+; CHECK-NEXT:    vmv.s.x v12, a1
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vmv.s.x v10, a1
-; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 5
 ; CHECK-NEXT:    vmv.s.x v10, a2
-; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 6
 ; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 5
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vmv.s.x v12, a3
-; CHECK-NEXT:    vslideup.vi v8, v12, 7
-; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vle32.v v12, (a0)
+; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 6
+; CHECK-NEXT:    vmv.s.x v10, a3
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 7
+; CHECK-NEXT:    vadd.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vadd = add <8 x i32> %vin, <i32 1, i32 2, i32 3, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
   %e0 = add i32 %a, 23
@@ -598,9 +599,9 @@ define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    slli a1, a1, 31
 ; RV32-NEXT:    srli a0, a0, 1
-; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    slli a3, a3, 31
 ; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    or a2, a2, a3
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
index 3c090bb9003119..ee953a66a004f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
@@ -85,16 +85,16 @@ define fastcc <128 x i32> @ret_split_v128i32(ptr %x) {
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a2, a1, 256
-; CHECK-NEXT:    vle32.v v16, (a1)
-; CHECK-NEXT:    addi a1, a1, 384
+; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    addi a2, a1, 384
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vle32.v v0, (a2)
-; CHECK-NEXT:    vse32.v v16, (a0)
 ; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vse32.v v24, (a1)
-; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vse32.v v0, (a1)
+; CHECK-NEXT:    vle32.v v0, (a2)
+; CHECK-NEXT:    addi a2, a0, 256
+; CHECK-NEXT:    vse32.v v24, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vse32.v v0, (a1)
+; CHECK-NEXT:    vse32.v v16, (a2)
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = load <128 x i32>, ptr %x
@@ -257,9 +257,7 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3
 ; CHECK-NEXT:    .cfi_def_cfa s0, 0
 ; CHECK-NEXT:    andi sp, sp, -128
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    mv a0, sp
+; CHECK-NEXT:    mv t0, sp
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    li a2, 2
 ; CHECK-NEXT:    li a3, 3
@@ -268,8 +266,10 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3
 ; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    li a7, 7
 ; CHECK-NEXT:    mv t3, sp
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    li t4, 8
-; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    vse32.v v8, (t0)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    call vector_arg_indirect_stack
@@ -306,19 +306,17 @@ define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3
 define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
 ; CHECK-LABEL: pass_vector_arg_direct_stack:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -160
-; CHECK-NEXT:    .cfi_def_cfa_offset 160
-; CHECK-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    addi sp, sp, -176
+; CHECK-NEXT:    .cfi_def_cfa_offset 176
+; CHECK-NEXT:    sd ra, 168(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s0, 160(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
+; CHECK-NEXT:    .cfi_offset s0, -16
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    sd a0, 144(sp)
-; CHECK-NEXT:    li a0, 13
-; CHECK-NEXT:    li t0, 12
+; CHECK-NEXT:    addi t0, sp, 16
+; CHECK-NEXT:    li t1, 1
+; CHECK-NEXT:    li t2, 13
+; CHECK-NEXT:    li s0, 12
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    li a2, 2
 ; CHECK-NEXT:    li a3, 3
@@ -327,17 +325,23 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32>
 ; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    li a7, 7
 ; CHECK-NEXT:    li t3, 8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vse32.v v8, (t0)
 ; CHECK-NEXT:    li t4, 9
 ; CHECK-NEXT:    li t5, 10
+; CHECK-NEXT:    sd t1, 144(sp)
 ; CHECK-NEXT:    li t6, 11
-; CHECK-NEXT:    sd t0, 0(sp)
-; CHECK-NEXT:    sd a0, 8(sp)
+; CHECK-NEXT:    sd s0, 0(sp)
+; CHECK-NEXT:    sd t2, 8(sp)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    call vector_arg_direct_stack
-; CHECK-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld ra, 168(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s0, 160(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_restore ra
-; CHECK-NEXT:    addi sp, sp, 160
+; CHECK-NEXT:    .cfi_restore s0
+; CHECK-NEXT:    addi sp, sp, 176
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %s = call fastcc <32 x i32> @vector_arg_direct_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
index fcdb5d5cb6aef7..73e148edbe2d67 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
@@ -85,16 +85,16 @@ define <128 x i32> @ret_split_v128i32(ptr %x) {
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a2, a1, 256
-; CHECK-NEXT:    vle32.v v16, (a1)
-; CHECK-NEXT:    addi a1, a1, 384
+; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    addi a2, a1, 384
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vle32.v v0, (a2)
-; CHECK-NEXT:    vse32.v v16, (a0)
 ; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vse32.v v24, (a1)
-; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vse32.v v0, (a1)
+; CHECK-NEXT:    vle32.v v0, (a2)
+; CHECK-NEXT:    addi a2, a0, 256
+; CHECK-NEXT:    vse32.v v24, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vse32.v v0, (a1)
+; CHECK-NEXT:    vse32.v v16, (a2)
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = load <128 x i32>, ptr %x
@@ -312,18 +312,18 @@ define <32 x i32> @pass_vector_arg_via_stack(<32 x i32> %x, <32 x i32> %y, <32 x
 ; CHECK-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vse32.v v8, (sp)
-; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    li t0, 8
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    li a2, 2
 ; CHECK-NEXT:    li a3, 3
 ; CHECK-NEXT:    li a4, 4
 ; CHECK-NEXT:    li a5, 5
 ; CHECK-NEXT:    li a6, 6
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vse32.v v8, (sp)
 ; CHECK-NEXT:    li a7, 7
-; CHECK-NEXT:    sd a0, 128(sp)
+; CHECK-NEXT:    sd t0, 128(sp)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    call vector_arg_via_stack
@@ -358,25 +358,27 @@ define <4 x i1> @pass_vector_mask_arg_via_stack(<4 x i1> %v) {
 ; CHECK-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vse32.v v8, (sp)
-; CHECK-NEXT:    li a0, 8
-; CHECK-NEXT:    sd a0, 128(sp)
+; CHECK-NEXT:    li a1, 8
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v17, 0
+; CHECK-NEXT:    addi a2, sp, 136
+; CHECK-NEXT:    li a5, 5
+; CHECK-NEXT:    li a6, 6
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    sd a1, 128(sp)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (sp)
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vmv.v.v v17, v16
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmsne.vi v16, v17, 0
-; CHECK-NEXT:    addi a0, sp, 136
-; CHECK-NEXT:    li a5, 5
-; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    li a7, 7
-; CHECK-NEXT:    vsm.v v16, (a0)
+; CHECK-NEXT:    vsm.v v16, (a2)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    li a2, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 087e55f904e8f9..511242aa677c2a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -72,10 +72,10 @@ define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -154,10 +154,10 @@ define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -238,10 +238,10 @@ define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -283,10 +283,10 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -324,10 +324,10 @@ define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.ceil.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 3
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 3
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 3
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index e2d7ed55c46016..9d0d42cf754c5e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -11,6 +11,7 @@ define <2 x i8> @vp_ctlz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -18,10 +19,9 @@ define <2 x i8> @vp_ctlz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -39,6 +39,7 @@ define <2 x i8> @vp_ctlz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -46,10 +47,9 @@ define <2 x i8> @vp_ctlz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -69,6 +69,7 @@ define <4 x i8> @vp_ctlz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -76,10 +77,9 @@ define <4 x i8> @vp_ctlz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -97,6 +97,7 @@ define <4 x i8> @vp_ctlz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -104,10 +105,9 @@ define <4 x i8> @vp_ctlz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -127,6 +127,7 @@ define <8 x i8> @vp_ctlz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -134,10 +135,9 @@ define <8 x i8> @vp_ctlz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -155,6 +155,7 @@ define <8 x i8> @vp_ctlz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -162,10 +163,9 @@ define <8 x i8> @vp_ctlz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -185,6 +185,7 @@ define <16 x i8> @vp_ctlz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -192,10 +193,9 @@ define <16 x i8> @vp_ctlz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -213,6 +213,7 @@ define <16 x i8> @vp_ctlz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -220,10 +221,9 @@ define <16 x i8> @vp_ctlz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -243,7 +243,9 @@ define <2 x i16> @vp_ctlz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -252,20 +254,18 @@ define <2 x i16> @vp_ctlz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -280,7 +280,9 @@ define <2 x i16> @vp_ctlz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -289,20 +291,18 @@ define <2 x i16> @vp_ctlz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -319,7 +319,9 @@ define <4 x i16> @vp_ctlz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -328,20 +330,18 @@ define <4 x i16> @vp_ctlz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -356,7 +356,9 @@ define <4 x i16> @vp_ctlz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -365,20 +367,18 @@ define <4 x i16> @vp_ctlz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -395,7 +395,9 @@ define <8 x i16> @vp_ctlz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -404,20 +406,18 @@ define <8 x i16> @vp_ctlz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -432,7 +432,9 @@ define <8 x i16> @vp_ctlz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -441,20 +443,18 @@ define <8 x i16> @vp_ctlz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -471,7 +471,9 @@ define <16 x i16> @vp_ctlz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
@@ -480,20 +482,18 @@ define <16 x i16> @vp_ctlz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -508,7 +508,9 @@ define <16 x i16> @vp_ctlz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
@@ -517,20 +519,18 @@ define <16 x i16> @vp_ctlz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -547,7 +547,9 @@ define <2 x i32> @vp_ctlz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -558,20 +560,18 @@ define <2 x i32> @vp_ctlz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -587,7 +587,9 @@ define <2 x i32> @vp_ctlz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -598,20 +600,18 @@ define <2 x i32> @vp_ctlz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -629,7 +629,9 @@ define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -640,20 +642,18 @@ define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -669,7 +669,9 @@ define <4 x i32> @vp_ctlz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -680,20 +682,18 @@ define <4 x i32> @vp_ctlz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -711,7 +711,9 @@ define <8 x i32> @vp_ctlz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
@@ -722,20 +724,18 @@ define <8 x i32> @vp_ctlz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -751,7 +751,9 @@ define <8 x i32> @vp_ctlz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
@@ -762,20 +764,18 @@ define <8 x i32> @vp_ctlz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -793,7 +793,9 @@ define <16 x i32> @vp_ctlz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
@@ -804,20 +806,18 @@ define <16 x i32> @vp_ctlz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -833,7 +833,9 @@ define <16 x i32> @vp_ctlz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
@@ -844,20 +846,18 @@ define <16 x i32> @vp_ctlz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -875,6 +875,12 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -884,49 +890,60 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1, v0.t
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -936,38 +953,21 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v9, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl)
@@ -979,6 +979,12 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -988,40 +994,34 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1031,6 +1031,23 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -1040,37 +1057,20 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0
+; RV64-NEXT:    vsrl.vx v9, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1084,59 +1084,76 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v4i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
@@ -1146,38 +1163,21 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl)
@@ -1189,6 +1189,12 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v10
@@ -1198,40 +1204,34 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1241,6 +1241,23 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v10
@@ -1250,37 +1267,20 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    vsrl.vx v10, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1294,59 +1294,76 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v8i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -1356,38 +1373,21 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v12, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl)
@@ -1399,6 +1399,12 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v12
@@ -1408,40 +1414,34 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1451,6 +1451,23 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v12
@@ -1460,37 +1477,20 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0
+; RV64-NEXT:    vsrl.vx v12, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1522,11 +1522,21 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1536,58 +1546,52 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1601,6 +1605,23 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1610,38 +1631,21 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
@@ -1666,46 +1670,48 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vx v0, v8, a1
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v0, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1717,6 +1723,23 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -1726,37 +1749,20 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v16, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1788,11 +1794,21 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1802,58 +1818,52 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1867,6 +1877,23 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1876,38 +1903,21 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
@@ -1932,46 +1942,48 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vx v0, v8, a1
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v0, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1983,6 +1995,23 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -1992,37 +2021,20 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v16, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -2050,29 +2062,32 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB34_2
+; RV32-NEXT:    bltu a0, a1, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB34_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    addi a4, sp, 32
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2082,37 +2097,31 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    mul a3, a3, a5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
@@ -2373,6 +2382,28 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:  .LBB34_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a6, a4, -241
+; RV64-NEXT:    addiw a7, a5, 257
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a5, a2, a5
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a4, a3, a4
+; RV64-NEXT:    slli a2, a6, 32
+; RV64-NEXT:    add a2, a6, a2
+; RV64-NEXT:    slli a3, a7, 32
+; RV64-NEXT:    add a3, a7, a3
+; RV64-NEXT:    addi a6, a0, -16
+; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2382,52 +2413,30 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a1, 32
 ; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    addi a7, sp, 16
 ; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a7, vlenb
 ; RV64-NEXT:    slli a7, a7, 3
 ; RV64-NEXT:    add a7, sp, a7
 ; RV64-NEXT:    addi a7, a7, 16
 ; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
@@ -2442,17 +2451,17 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2475,113 +2484,144 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB35_2
+; RV32-NEXT:    bltu a0, a3, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vnot.v v0, v8
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a0, -16
+; RV32-NEXT:    sltu a0, a0, a3
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 8
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v16, 16
+; RV32-NEXT:    vor.vv v16, v16, v8
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v0, v16, a2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v0, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
 ; RV32-NEXT:    vand.vv v24, v24, v8
 ; RV32-NEXT:    vadd.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vadd.vv v24, v24, v0
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 2
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 8
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 16
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vx v0, v24, a2
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vnot.v v16, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v24
 ; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsub.vv v0, v16, v0
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v8, v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v0, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    li a2, 56
@@ -2607,78 +2647,100 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:  .LBB35_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    li a2, 32
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a7, a3, 1365
+; RV64-NEXT:    addiw a3, a4, 819
+; RV64-NEXT:    addiw a4, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a7, a7, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a5, a3, a5
+; RV64-NEXT:    slli a3, a4, 32
+; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    slli a4, a6, 32
+; RV64-NEXT:    add a4, a6, a4
+; RV64-NEXT:    addi a6, a0, -16
+; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v24, v8, a1
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 8
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 2
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 16
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 8
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 16
 ; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vx v24, v16, a1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a7
+; RV64-NEXT:    vsub.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v16, a2
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a5
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vnot.v v16, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a5
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vmul.vx v16, v16, a4
+; RV64-NEXT:    vsrl.vx v16, v16, a0
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
@@ -2689,6 +2751,7 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -2696,10 +2759,9 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2717,6 +2779,7 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -2724,10 +2787,9 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2745,6 +2807,7 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -2752,10 +2815,9 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2773,6 +2835,7 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -2780,10 +2843,9 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2801,6 +2863,7 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -2808,10 +2871,9 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2829,6 +2891,7 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -2836,10 +2899,9 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2857,6 +2919,7 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -2864,10 +2927,9 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2885,6 +2947,7 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -2892,10 +2955,9 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2913,7 +2975,9 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -2922,20 +2986,18 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2950,7 +3012,9 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -2959,20 +3023,18 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2987,7 +3049,9 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -2996,20 +3060,18 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -3024,7 +3086,9 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -3033,20 +3097,18 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -3061,7 +3123,9 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -3070,20 +3134,18 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -3098,7 +3160,9 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -3107,20 +3171,18 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -3135,7 +3197,9 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
@@ -3144,20 +3208,18 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -3172,7 +3234,9 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
@@ -3181,20 +3245,18 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -3209,7 +3271,9 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -3220,20 +3284,18 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3249,7 +3311,9 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -3260,20 +3324,18 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3289,7 +3351,9 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -3300,20 +3364,18 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3329,7 +3391,9 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -3340,20 +3404,18 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3369,7 +3431,9 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
@@ -3380,20 +3444,18 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3409,7 +3471,9 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
@@ -3420,20 +3484,18 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3449,7 +3511,9 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
@@ -3460,20 +3524,18 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3489,7 +3551,9 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
@@ -3500,20 +3564,18 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3529,6 +3591,12 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -3538,49 +3606,60 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1, v0.t
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -3590,38 +3669,21 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v9, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 true, <2 x i1> %m, i32 %evl)
@@ -3633,6 +3695,12 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -3642,40 +3710,34 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3685,6 +3747,23 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -3694,37 +3773,20 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0
+; RV64-NEXT:    vsrl.vx v9, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3736,59 +3798,76 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV32-LABEL: vp_ctlz_zero_undef_v4i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
@@ -3798,38 +3877,21 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 true, <4 x i1> %m, i32 %evl)
@@ -3840,7 +3902,13 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32-LABEL: vp_ctlz_zero_undef_v4i64_unmasked:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v10
@@ -3850,40 +3918,34 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3893,6 +3955,23 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v10
@@ -3902,37 +3981,20 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    vsrl.vx v10, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3944,59 +4006,76 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV32-LABEL: vp_ctlz_zero_undef_v8i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -4006,38 +4085,21 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v12, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 true, <8 x i1> %m, i32 %evl)
@@ -4049,6 +4111,12 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v12
@@ -4058,40 +4126,34 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -4101,6 +4163,23 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v12
@@ -4110,37 +4189,20 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0
+; RV64-NEXT:    vsrl.vx v12, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -4170,11 +4232,21 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4184,58 +4256,52 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -4249,6 +4315,23 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4258,38 +4341,21 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
@@ -4314,46 +4380,48 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vx v0, v8, a1
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v0, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -4365,6 +4433,23 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -4374,37 +4459,20 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v16, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -4434,11 +4502,21 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4448,58 +4526,52 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -4513,6 +4585,23 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4522,38 +4611,21 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
@@ -4578,46 +4650,48 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vx v0, v8, a1
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v0, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -4629,6 +4703,23 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -4638,37 +4729,20 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v16, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -4694,29 +4768,32 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB70_2
+; RV32-NEXT:    bltu a0, a1, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB70_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    addi a4, sp, 32
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4726,37 +4803,31 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    mul a3, a3, a5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
@@ -5017,6 +5088,28 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:  .LBB70_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a6, a4, -241
+; RV64-NEXT:    addiw a7, a5, 257
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a5, a2, a5
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a4, a3, a4
+; RV64-NEXT:    slli a2, a6, 32
+; RV64-NEXT:    add a2, a6, a2
+; RV64-NEXT:    slli a3, a7, 32
+; RV64-NEXT:    add a3, a7, a3
+; RV64-NEXT:    addi a6, a0, -16
+; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -5026,52 +5119,30 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a1, 32
 ; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    addi a7, sp, 16
 ; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a7, vlenb
 ; RV64-NEXT:    slli a7, a7, 3
 ; RV64-NEXT:    add a7, sp, a7
 ; RV64-NEXT:    addi a7, a7, 16
 ; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
@@ -5086,17 +5157,17 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -5119,113 +5190,144 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB71_2
+; RV32-NEXT:    bltu a0, a3, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB71_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vnot.v v0, v8
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a0, -16
+; RV32-NEXT:    sltu a0, a0, a3
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 8
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v16, 16
+; RV32-NEXT:    vor.vv v16, v16, v8
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v0, v16, a2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v0, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
 ; RV32-NEXT:    vand.vv v24, v24, v8
 ; RV32-NEXT:    vadd.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vadd.vv v24, v24, v0
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 2
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 8
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 16
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vx v0, v24, a2
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vnot.v v16, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v24
 ; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsub.vv v0, v16, v0
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v8, v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v0, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    li a2, 56
@@ -5251,78 +5353,100 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:  .LBB71_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    li a2, 32
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a7, a3, 1365
+; RV64-NEXT:    addiw a3, a4, 819
+; RV64-NEXT:    addiw a4, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a7, a7, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a5, a3, a5
+; RV64-NEXT:    slli a3, a4, 32
+; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    slli a4, a6, 32
+; RV64-NEXT:    add a4, a6, a4
+; RV64-NEXT:    addi a6, a0, -16
+; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v24, v8, a1
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 8
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 2
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 16
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 8
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 16
 ; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vx v24, v16, a1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a7
+; RV64-NEXT:    vsub.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v16, a2
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a5
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vnot.v v16, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a5
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vmul.vx v16, v16, a4
+; RV64-NEXT:    vsrl.vx v16, v16, a0
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
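
For orientation only (this note and sketch are editorial, not part of the generated checks): the RV64 sequences above are the vector form of the usual bit-smearing ctlz followed by a SWAR popcount. The four lui/addiw/slli/add groups in the diff materialize the 0x5555…, 0x3333…, 0x0f0f…, and 0x0101… masks, and the scheduling change only moves where those constants (and the shift amounts 32 and 56) are set up relative to the vector ops; the algorithm itself is unchanged. A minimal scalar C sketch of the same expansion, with an illustrative name (ctlz64) not taken from the patch:

#include <stdint.h>

/* Scalar equivalent of the ctlz lowering exercised above (zero input
 * is treated as "zero undef" in the tests; here it simply returns 64). */
static uint64_t ctlz64(uint64_t x) {
  /* Smear the highest set bit into every lower position. */
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  x |= x >> 32;                               /* li 32 / vsrl.vx            */
  x = ~x;                                     /* leading zeros become ones  */
  /* SWAR popcount using the masks materialized in the diff. */
  x -= (x >> 1) & 0x5555555555555555ULL;      /* lui 349525 / addiw 1365    */
  x = (x & 0x3333333333333333ULL) +
      ((x >> 2) & 0x3333333333333333ULL);     /* lui 209715 / addiw 819     */
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL; /* lui 61681 / addiw -241     */
  return (x * 0x0101010101010101ULL) >> 56;   /* lui 4112 / addiw 257, sr 56 */
}
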
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index 228a9f0d6d5221..4bd4a9a854f365 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -13,6 +13,7 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -21,10 +22,9 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -39,12 +39,12 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    li a1, 134
 ; RVF-NEXT:    vzext.vf2 v10, v8
 ; RVF-NEXT:    vfwcvt.f.xu.v v12, v10
 ; RVF-NEXT:    vnsrl.wi v8, v12, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVF-NEXT:    vnsrl.wi v10, v8, 0
-; RVF-NEXT:    li a1, 134
 ; RVF-NEXT:    vrsub.vx v8, v10, a1
 ; RVF-NEXT:    li a1, 8
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -55,12 +55,12 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    li a1, 134
 ; RVD-NEXT:    vzext.vf2 v10, v8
 ; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
 ; RVD-NEXT:    vnsrl.wi v8, v12, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVD-NEXT:    vnsrl.wi v10, v8, 0
-; RVD-NEXT:    li a1, 134
 ; RVD-NEXT:    vrsub.vx v8, v10, a1
 ; RVD-NEXT:    li a1, 8
 ; RVD-NEXT:    vminu.vx v8, v8, a1
@@ -87,6 +87,8 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVI-NEXT:    vle16.v v8, (a0)
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -97,20 +99,18 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -122,9 +122,9 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
+; RVF-NEXT:    li a1, 142
 ; RVF-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVF-NEXT:    vnsrl.wi v8, v10, 23
-; RVF-NEXT:    li a1, 142
 ; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 16
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -135,9 +135,9 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
+; RVD-NEXT:    li a1, 142
 ; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    vnsrl.wi v8, v10, 23
-; RVD-NEXT:    li a1, 142
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 16
 ; RVD-NEXT:    vminu.vx v8, v8, a1
@@ -164,6 +164,8 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVI-NEXT:    vle32.v v8, (a0)
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -176,20 +178,18 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -205,8 +205,8 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v8, v8, 23
 ; RVF-NEXT:    li a1, 158
+; RVF-NEXT:    vsrl.vi v8, v8, 23
 ; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -217,8 +217,8 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    li a1, 52
+; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    vnsrl.wx v8, v10, a1
 ; RVD-NEXT:    li a1, 1054
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
@@ -247,50 +247,50 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 2
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 8
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 16
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    li a1, 32
-; RV32I-NEXT:    vsrl.vx v9, v8, a1
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    vmv.v.x v9, a1
+; RV32I-NEXT:    li a1, 32
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vx v10, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -301,6 +301,23 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 32
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 2
@@ -311,37 +328,20 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v9
-; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    vsrl.vx v9, v8, a1
+; RV64I-NEXT:    vsrl.vx v9, v8, a5
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v9, v9, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vx v9, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -401,6 +401,7 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    li a1, 32
 ; RVI-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -409,10 +410,9 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -428,12 +428,12 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    li a1, 134
 ; RVF-NEXT:    vzext.vf2 v12, v8
 ; RVF-NEXT:    vfwcvt.f.xu.v v16, v12
 ; RVF-NEXT:    vnsrl.wi v8, v16, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVF-NEXT:    vnsrl.wi v12, v8, 0
-; RVF-NEXT:    li a1, 134
 ; RVF-NEXT:    vrsub.vx v8, v12, a1
 ; RVF-NEXT:    li a1, 8
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -445,12 +445,12 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    li a1, 134
 ; RVD-NEXT:    vzext.vf2 v12, v8
 ; RVD-NEXT:    vfwcvt.f.xu.v v16, v12
 ; RVD-NEXT:    vnsrl.wi v8, v16, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVD-NEXT:    vnsrl.wi v12, v8, 0
-; RVD-NEXT:    li a1, 134
 ; RVD-NEXT:    vrsub.vx v8, v12, a1
 ; RVD-NEXT:    li a1, 8
 ; RVD-NEXT:    vminu.vx v8, v8, a1
@@ -478,6 +478,8 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVI-NEXT:    vle16.v v8, (a0)
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -488,20 +490,18 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -513,9 +513,9 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
+; RVF-NEXT:    li a1, 142
 ; RVF-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVF-NEXT:    vnsrl.wi v8, v12, 23
-; RVF-NEXT:    li a1, 142
 ; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 16
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -526,9 +526,9 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
+; RVD-NEXT:    li a1, 142
 ; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    vnsrl.wi v8, v12, 23
-; RVD-NEXT:    li a1, 142
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 16
 ; RVD-NEXT:    vminu.vx v8, v8, a1
@@ -555,6 +555,8 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVI-NEXT:    vle32.v v8, (a0)
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -567,20 +569,18 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -596,8 +596,8 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v8, v8, 23
 ; RVF-NEXT:    li a1, 158
+; RVF-NEXT:    vsrl.vi v8, v8, 23
 ; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -608,8 +608,8 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    li a1, 52
+; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    vnsrl.wx v8, v12, a1
 ; RVD-NEXT:    li a1, 1054
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
@@ -638,50 +638,50 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 2
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 8
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 16
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    li a1, 32
-; RV32I-NEXT:    vsrl.vx v10, v8, a1
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
+; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    li a1, 32
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vx v12, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -692,6 +692,23 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 32
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 2
@@ -702,37 +719,20 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v10
-; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    vsrl.vx v10, v8, a1
+; RV64I-NEXT:    vsrl.vx v10, v8, a5
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v10, v10, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vx v10, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -791,6 +791,7 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -799,10 +800,9 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -860,6 +860,8 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVI-NEXT:    vle16.v v8, (a0)
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -870,20 +872,18 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -932,6 +932,8 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVI-NEXT:    vle32.v v8, (a0)
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -944,20 +946,18 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -983,8 +983,8 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    li a1, 52
+; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    vnsrl.wx v8, v10, a1
 ; RVD-NEXT:    li a1, 1054
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
@@ -1010,50 +1010,50 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 2
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 8
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 16
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    li a1, 32
-; RV32I-NEXT:    vsrl.vx v9, v8, a1
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    vmv.v.x v9, a1
+; RV32I-NEXT:    li a1, 32
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vx v10, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1064,6 +1064,23 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 32
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 2
@@ -1074,37 +1091,20 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v9
-; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    vsrl.vx v9, v8, a1
+; RV64I-NEXT:    vsrl.vx v9, v8, a5
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v9, v9, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vx v9, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1158,6 +1158,7 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    li a1, 32
 ; RVI-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -1166,10 +1167,9 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -1230,6 +1230,8 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVI-NEXT:    vle16.v v8, (a0)
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -1240,20 +1242,18 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -1302,6 +1302,8 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVI-NEXT:    vle32.v v8, (a0)
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -1314,20 +1316,18 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -1353,8 +1353,8 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    li a1, 52
+; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    vnsrl.wx v8, v12, a1
 ; RVD-NEXT:    li a1, 1054
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
@@ -1380,50 +1380,50 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 2
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 8
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 16
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    li a1, 32
-; RV32I-NEXT:    vsrl.vx v10, v8, a1
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
+; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    li a1, 32
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vx v12, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1434,6 +1434,23 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 32
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 2
@@ -1444,37 +1461,20 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v10
-; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    vsrl.vx v10, v8, a1
+; RV64I-NEXT:    vsrl.vx v10, v8, a5
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v10, v10, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vx v10, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index 35a1822337f4df..5e73e6df9170c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -13,8 +13,8 @@ define <2 x i8> @vp_ctpop_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -34,8 +34,8 @@ define <2 x i8> @vp_ctpop_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -57,8 +57,8 @@ define <4 x i8> @vp_ctpop_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -78,8 +78,8 @@ define <4 x i8> @vp_ctpop_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -101,8 +101,8 @@ define <8 x i8> @vp_ctpop_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -122,8 +122,8 @@ define <8 x i8> @vp_ctpop_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -145,8 +145,8 @@ define <16 x i8> @vp_ctpop_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl)
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -166,8 +166,8 @@ define <16 x i8> @vp_ctpop_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -190,17 +190,17 @@ define <2 x i16> @vp_ctpop_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -218,17 +218,17 @@ define <2 x i16> @vp_ctpop_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -248,17 +248,17 @@ define <4 x i16> @vp_ctpop_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -276,17 +276,17 @@ define <4 x i16> @vp_ctpop_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -306,17 +306,17 @@ define <8 x i16> @vp_ctpop_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -334,17 +334,17 @@ define <8 x i16> @vp_ctpop_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -364,17 +364,17 @@ define <16 x i16> @vp_ctpop_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -392,17 +392,17 @@ define <16 x i16> @vp_ctpop_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -422,17 +422,17 @@ define <2 x i32> @vp_ctpop_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -451,17 +451,17 @@ define <2 x i32> @vp_ctpop_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -482,17 +482,17 @@ define <4 x i32> @vp_ctpop_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -511,17 +511,17 @@ define <4 x i32> @vp_ctpop_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -542,17 +542,17 @@ define <8 x i32> @vp_ctpop_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -571,17 +571,17 @@ define <8 x i32> @vp_ctpop_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -602,17 +602,17 @@ define <16 x i32> @vp_ctpop_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -631,17 +631,17 @@ define <16 x i32> @vp_ctpop_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -663,67 +663,67 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v2i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctpop.v2i64(<2 x i64> %va, <2 x i1> %m, i32 %evl)
@@ -739,31 +739,31 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -771,34 +771,34 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v2i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -817,67 +817,67 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v4i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl)
@@ -893,31 +893,31 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -925,34 +925,34 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v4i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -971,67 +971,67 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v16, v12, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v8i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64> %va, <8 x i1> %m, i32 %evl)
@@ -1047,31 +1047,31 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1079,34 +1079,34 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1122,10 +1122,11 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -1144,66 +1145,41 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -1212,35 +1188,35 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_ctpop_v15i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
@@ -1265,28 +1241,31 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v16, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v0, v8
 ; RV32-NEXT:    vsrl.vi v0, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vand.vv v8, v8, v16
@@ -1299,34 +1278,34 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v15i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1342,10 +1321,11 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -1364,66 +1344,41 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -1432,35 +1387,35 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_ctpop_v16i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl)
@@ -1485,28 +1440,31 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v16, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v0, v8
 ; RV32-NEXT:    vsrl.vi v0, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vand.vv v8, v8, v16
@@ -1519,34 +1477,34 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1567,90 +1525,68 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB34_2
+; RV32-NEXT:    bltu a0, a3, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB34_2:
 ; RV32-NEXT:    addi a2, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
@@ -1658,7 +1594,7 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
@@ -1677,7 +1613,7 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
@@ -1692,7 +1628,7 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
@@ -1711,22 +1647,22 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -1740,7 +1676,7 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
@@ -1785,58 +1721,58 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    addiw a2, a2, 819
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a5, a1, a5
+; RV64-NEXT:    slli a1, a2, 32
+; RV64-NEXT:    add a6, a2, a1
+; RV64-NEXT:    slli a1, a3, 32
+; RV64-NEXT:    add a1, a3, a1
+; RV64-NEXT:    slli a2, a4, 32
+; RV64-NEXT:    add a2, a4, a2
+; RV64-NEXT:    addi a3, a0, -16
+; RV64-NEXT:    sltu a0, a0, a3
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a6, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    addiw a3, a3, -241
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addiw a4, a4, 257
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    li a5, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
-; RV64-NEXT:    addi a6, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 16
-; RV64-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v16, a6, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -1856,88 +1792,141 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT:    vmv8r.v v24, v16
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB35_2
+; RV32-NEXT:    bltu a0, a3, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a2, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 32
-; RV32-NEXT:    vlse64.v v24, (a2), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v0, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v0, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv8r.v v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v8, v16
-; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v24, v24, 1
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v24, v16
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v0
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v8, v0
+; RV32-NEXT:    addi a2, sp, 24
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vadd.vv v24, v24, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 4
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v24, v24, v8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v0
+; RV32-NEXT:    vmul.vv v16, v24, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v24, v8, v0
 ; RV32-NEXT:    li a2, 56
@@ -1946,7 +1935,8 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v24, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -1963,51 +1953,61 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:  .LBB35_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    addiw a2, a2, 819
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    addiw a3, a3, -241
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addiw a4, a4, 257
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a5, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a5
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a4, a4, -241
+; RV64-NEXT:    addiw a5, a5, 257
+; RV64-NEXT:    slli a6, a2, 32
+; RV64-NEXT:    add a2, a2, a6
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a3, a3, a6
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    slli a6, a5, 32
+; RV64-NEXT:    add a5, a5, a6
 ; RV64-NEXT:    addi a6, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a6
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a6
+; RV64-NEXT:    li a6, 56
+; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a2
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a3
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vmul.vx v16, v16, a4
-; RV64-NEXT:    vsrl.vx v16, v16, a5
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a6
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v16, v16, a5
+; RV64-NEXT:    vsrl.vx v16, v16, a6
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index b5114bbe491896..4fbe67cfcd642a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -9,11 +9,11 @@ define void @ctpop_v16i8(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a1, 85
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a1, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
@@ -44,21 +44,21 @@ define void @ctpop_v8i16(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a1, 5
 ; CHECK-NEXT:    addi a1, a1, 1365
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a1, 3
 ; CHECK-NEXT:    addi a1, a1, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a1
@@ -86,21 +86,21 @@ define void @ctpop_v4i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a1, 349525
 ; CHECK-NEXT:    addi a1, a1, 1365
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a1, 209715
 ; CHECK-NEXT:    addi a1, a1, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    lui a1, 61681
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    lui a1, 4112
 ; CHECK-NEXT:    addi a1, a1, 257
@@ -133,32 +133,32 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
 ; RV32-NEXT:    vand.vv v9, v10, v9
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
@@ -169,33 +169,33 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vsrl.vi v9, v8, 1
 ; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
 ; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    vsrl.vi v9, v8, 1
 ; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v8, a1
+; RV64-NEXT:    vand.vx v9, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    addiw a1, a1, -241
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    lui a1, 4112
-; RV64-NEXT:    addiw a1, a1, 257
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vmul.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a1, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a1
 ; RV64-NEXT:    vse64.v v8, (a0)
@@ -222,11 +222,11 @@ define void @ctpop_v32i8(ptr %x, ptr %y) {
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    li a1, 85
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    li a1, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
@@ -258,21 +258,21 @@ define void @ctpop_v16i16(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a1, 5
 ; CHECK-NEXT:    addi a1, a1, 1365
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a1, 3
 ; CHECK-NEXT:    addi a1, a1, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a1
@@ -300,21 +300,21 @@ define void @ctpop_v8i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a1, 349525
 ; CHECK-NEXT:    addi a1, a1, 1365
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a1, 209715
 ; CHECK-NEXT:    addi a1, a1, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    lui a1, 61681
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    lui a1, 4112
 ; CHECK-NEXT:    addi a1, a1, 257
@@ -439,32 +439,32 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
 ; RV32-NEXT:    vand.vv v10, v12, v10
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
@@ -475,33 +475,33 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vsrl.vi v10, v8, 1
 ; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
 ; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    vsrl.vi v10, v8, 1
 ; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vand.vx v10, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    addiw a1, a1, -241
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    lui a1, 4112
-; RV64-NEXT:    addiw a1, a1, 257
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vmul.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a1, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a1
 ; RV64-NEXT:    vse64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index d4c4ea7fee184f..cd4b19f11d1602 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -12,13 +12,13 @@ define <2 x i8> @vp_cttz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -36,14 +36,14 @@ define <2 x i8> @vp_cttz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -64,13 +64,13 @@ define <4 x i8> @vp_cttz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -88,14 +88,14 @@ define <4 x i8> @vp_cttz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -116,13 +116,13 @@ define <8 x i8> @vp_cttz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -140,14 +140,14 @@ define <8 x i8> @vp_cttz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -168,13 +168,13 @@ define <16 x i8> @vp_cttz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -192,14 +192,14 @@ define <16 x i8> @vp_cttz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -220,23 +220,23 @@ define <2 x i16> @vp_cttz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -251,24 +251,24 @@ define <2 x i16> @vp_cttz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -286,23 +286,23 @@ define <4 x i16> @vp_cttz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -317,24 +317,24 @@ define <4 x i16> @vp_cttz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -352,23 +352,23 @@ define <8 x i16> @vp_cttz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -383,24 +383,24 @@ define <8 x i16> @vp_cttz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -418,23 +418,23 @@ define <16 x i16> @vp_cttz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -449,24 +449,24 @@ define <16 x i16> @vp_cttz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -484,23 +484,23 @@ define <2 x i32> @vp_cttz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -516,24 +516,24 @@ define <2 x i32> @vp_cttz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -552,23 +552,23 @@ define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -584,24 +584,24 @@ define <4 x i32> @vp_cttz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -620,23 +620,23 @@ define <8 x i32> @vp_cttz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -652,24 +652,24 @@ define <8 x i32> @vp_cttz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -688,23 +688,23 @@ define <16 x i32> @vp_cttz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -720,24 +720,24 @@ define <16 x i32> @vp_cttz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -756,78 +756,78 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl)
@@ -839,39 +839,39 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vnot.v v9, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -880,37 +880,37 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -926,78 +926,78 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl)
@@ -1009,39 +1009,39 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vnot.v v10, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1050,37 +1050,37 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1096,78 +1096,78 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl)
@@ -1179,39 +1179,39 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vnot.v v12, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1220,37 +1220,37 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1269,6 +1269,9 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -1285,59 +1288,60 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1350,38 +1354,38 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV64-LABEL: vp_cttz_v15i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
@@ -1393,6 +1397,9 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -32
 ; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 24(sp)
@@ -1406,36 +1413,35 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v0, v8, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1446,37 +1452,37 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v15i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1495,6 +1501,9 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -1511,59 +1520,60 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1576,38 +1586,38 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV64-LABEL: vp_cttz_v16i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
@@ -1619,6 +1629,9 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -32
 ; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 24(sp)
@@ -1632,36 +1645,35 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v0, v8, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1672,37 +1684,37 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v16i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1718,50 +1730,51 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB34_2
+; RV32-NEXT:    bltu a0, a1, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB34_2:
 ; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
@@ -1773,72 +1786,40 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a3, sp, 32
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
@@ -1855,16 +1836,13 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a3, a3, a5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -1874,23 +1852,15 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
@@ -1903,7 +1873,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
@@ -1913,84 +1883,49 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -2002,7 +1937,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
@@ -2026,73 +1961,73 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
-; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    mv a4, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    li a4, 16
 ; RV64-NEXT:  .LBB34_2:
 ; RV64-NEXT:    li a1, 1
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a7, a5, -241
+; RV64-NEXT:    addiw t0, a6, 257
+; RV64-NEXT:    slli a6, a2, 32
+; RV64-NEXT:    add a6, a2, a6
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a5, a3, a5
+; RV64-NEXT:    slli a2, a7, 32
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
+; RV64-NEXT:    addi a7, a0, -16
+; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a7, a0, a7
+; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    slli a7, a7, 3
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2112,105 +2047,102 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB35_2
+; RV32-NEXT:    bltu a0, a3, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v24, v8, a2
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vnot.v v0, v8
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vand.vv v0, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v8
-; RV32-NEXT:    vadd.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vadd.vv v24, v24, v0
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v8, v8, a2
+; RV32-NEXT:    vand.vv v8, v0, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v0, v16, a2
+; RV32-NEXT:    vnot.v v16, v16
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vsub.vv v0, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v24, v0, a2
-; RV32-NEXT:    vnot.v v0, v0
-; RV32-NEXT:    vand.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v16, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    vadd.vv v24, v24, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
 ; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vadd.vv v8, v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v0, v24
+; RV32-NEXT:    vmul.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
+; RV32-NEXT:    vmul.vv v24, v8, v0
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v16, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v24, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -2227,58 +2159,68 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:  .LBB35_2:
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v8, a2
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vnot.v v24, v8
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a7, a3, 32
+; RV64-NEXT:    add a3, a3, a7
+; RV64-NEXT:    slli a7, a4, 32
+; RV64-NEXT:    add a4, a4, a7
+; RV64-NEXT:    slli a7, a5, 32
+; RV64-NEXT:    add a5, a5, a7
+; RV64-NEXT:    slli a7, a6, 32
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    addi a7, a0, -16
+; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    li a7, 56
+; RV64-NEXT:    vsub.vx v8, v8, a2
+; RV64-NEXT:    vand.vv v8, v24, v8
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a1, a1, a3
-; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v24, v16, a2
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a4
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a6
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a7
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vmul.vx v16, v16, a6
+; RV64-NEXT:    vsrl.vx v16, v16, a7
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
@@ -2290,13 +2232,13 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2314,14 +2256,14 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2340,13 +2282,13 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2364,14 +2306,14 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2390,13 +2332,13 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2414,14 +2356,14 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2440,13 +2382,13 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2464,14 +2406,14 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2490,23 +2432,23 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2521,24 +2463,24 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2554,23 +2496,23 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2585,24 +2527,24 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2618,23 +2560,23 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2649,24 +2591,24 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2682,23 +2624,23 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2713,24 +2655,24 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2746,23 +2688,23 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2778,24 +2720,24 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2812,23 +2754,23 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2844,24 +2786,24 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2878,23 +2820,23 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2910,24 +2852,24 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2944,23 +2886,23 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2976,24 +2918,24 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3010,78 +2952,78 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 true, <2 x i1> %m, i32 %evl)
@@ -3093,39 +3035,39 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vnot.v v9, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3134,37 +3076,37 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV64-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3178,78 +3120,78 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 true, <4 x i1> %m, i32 %evl)
@@ -3261,39 +3203,39 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vnot.v v10, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3302,37 +3244,37 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV64-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3346,78 +3288,78 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 true, <8 x i1> %m, i32 %evl)
@@ -3429,39 +3371,39 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vnot.v v12, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3470,37 +3412,37 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV64-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3517,6 +3459,9 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -3533,59 +3478,60 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -3598,38 +3544,38 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV64-LABEL: vp_cttz_zero_undef_v15i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
@@ -3641,6 +3587,9 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -32
 ; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 24(sp)
@@ -3654,36 +3603,35 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v0, v8, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3694,37 +3642,37 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3741,6 +3689,9 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -3757,59 +3708,60 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -3822,38 +3774,38 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV64-LABEL: vp_cttz_zero_undef_v16i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
@@ -3865,6 +3817,9 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -32
 ; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 24(sp)
@@ -3878,36 +3833,35 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v0, v8, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3918,37 +3872,37 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3962,50 +3916,51 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB70_2
+; RV32-NEXT:    bltu a0, a1, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB70_2:
 ; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
@@ -4017,72 +3972,40 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a3, sp, 32
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
@@ -4099,16 +4022,13 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a3, a3, a5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -4118,23 +4038,15 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
@@ -4147,7 +4059,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
@@ -4157,84 +4069,49 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -4246,7 +4123,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
@@ -4270,73 +4147,73 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
-; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    mv a4, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB70_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    li a4, 16
 ; RV64-NEXT:  .LBB70_2:
 ; RV64-NEXT:    li a1, 1
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a7, a5, -241
+; RV64-NEXT:    addiw t0, a6, 257
+; RV64-NEXT:    slli a6, a2, 32
+; RV64-NEXT:    add a6, a2, a6
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a5, a3, a5
+; RV64-NEXT:    slli a2, a7, 32
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
+; RV64-NEXT:    addi a7, a0, -16
+; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a7, a0, a7
+; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    slli a7, a7, 3
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -4356,105 +4233,102 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB71_2
+; RV32-NEXT:    bltu a0, a3, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB71_2:
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v24, v8, a2
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vnot.v v0, v8
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vand.vv v0, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v8
-; RV32-NEXT:    vadd.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vadd.vv v24, v24, v0
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v8, v8, a2
+; RV32-NEXT:    vand.vv v8, v0, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v0, v16, a2
+; RV32-NEXT:    vnot.v v16, v16
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vsub.vv v0, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v24, v0, a2
-; RV32-NEXT:    vnot.v v0, v0
-; RV32-NEXT:    vand.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v16, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    vadd.vv v24, v24, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
 ; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vadd.vv v8, v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v0, v24
+; RV32-NEXT:    vmul.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
+; RV32-NEXT:    vmul.vv v24, v8, v0
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v16, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v24, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -4471,58 +4345,68 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:  .LBB71_2:
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v8, a2
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a1, a1, a3
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
+; RV64-NEXT:    vnot.v v24, v8
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a7, a3, 32
+; RV64-NEXT:    add a3, a3, a7
+; RV64-NEXT:    slli a7, a4, 32
+; RV64-NEXT:    add a4, a4, a7
+; RV64-NEXT:    slli a7, a5, 32
+; RV64-NEXT:    add a5, a5, a7
+; RV64-NEXT:    slli a7, a6, 32
+; RV64-NEXT:    add a6, a6, a7
 ; RV64-NEXT:    addi a7, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a7
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    li a7, 56
+; RV64-NEXT:    vsub.vx v8, v8, a2
+; RV64-NEXT:    vand.vv v8, v24, v8
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v24, v16, a2
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a4
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a6
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a7
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vmul.vx v16, v16, a6
+; RV64-NEXT:    vsrl.vx v16, v16, a7
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 4b1691aada5bef..57e0eeb92ee2f6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -15,13 +15,13 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle8.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -36,6 +36,7 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v9, v8, v9
 ; RVF-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -44,7 +45,6 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vnsrl.wi v10, v12, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVF-NEXT:    vnsrl.wi v9, v10, 0
-; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
 ; RVF-NEXT:    vsub.vx v8, v9, a1
 ; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -55,6 +55,7 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v9, v8, v9
 ; RVD-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -63,7 +64,6 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vnsrl.wi v10, v12, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVD-NEXT:    vnsrl.wi v9, v10, 0
-; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
 ; RVD-NEXT:    vsub.vx v8, v9, a1
 ; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -92,23 +92,23 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle16.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -120,15 +120,15 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
-; RVF-NEXT:    vrsub.vi v9, v8, 0
-; RVF-NEXT:    vand.vv v9, v8, v9
-; RVF-NEXT:    vfwcvt.f.xu.v v10, v9
-; RVF-NEXT:    vnsrl.wi v9, v10, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v9, v9, a1
+; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vand.vv v8, v8, v9
+; RVF-NEXT:    vfwcvt.f.xu.v v10, v8
+; RVF-NEXT:    vnsrl.wi v8, v10, 23
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 16
-; RVF-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse16.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -136,15 +136,15 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
-; RVD-NEXT:    vrsub.vi v9, v8, 0
-; RVD-NEXT:    vand.vv v9, v8, v9
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v9
-; RVD-NEXT:    vnsrl.wi v9, v10, 23
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vsub.vx v9, v9, a1
+; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vand.vv v8, v8, v9
+; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
+; RVD-NEXT:    vnsrl.wi v8, v10, 23
+; RVD-NEXT:    vsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 16
-; RVD-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse16.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -170,23 +170,23 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle32.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -199,17 +199,17 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVF-NEXT:    vle32.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v9, v8, v9
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v9, v9
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v9, v9, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v9, v9, a1
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vsrl.vi v8, v9, 23
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 32
-; RVF-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse32.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -217,16 +217,16 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v9, v8, v9
 ; RVD-NEXT:    vfwcvt.f.xu.v v10, v9
-; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vnsrl.wx v9, v10, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v9, v9, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vsub.vx v8, v9, a1
 ; RVD-NEXT:    li a1, 32
-; RVD-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse32.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -250,40 +250,40 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v9, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    vmv.v.x v9, a1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vx v10, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -294,37 +294,37 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v9, v8, a1
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 1
+; RV64I-NEXT:    vsub.vx v9, v8, a5
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v9, v9, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vx v9, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -334,19 +334,21 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVF-NEXT:    vle64.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v9, v8, v9
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RVF-NEXT:    vfncvt.f.xu.w v10, v9
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v9, v10, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vwsubu.vx v10, v9, a1
 ; RVF-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RVF-NEXT:    vsrl.vi v8, v10, 23
+; RVF-NEXT:    vwsubu.vx v9, v8, a1
 ; RVF-NEXT:    li a1, 64
-; RVF-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVF-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RVF-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; RVF-NEXT:    vse64.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -354,18 +356,18 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVD-NEXT:    vle64.v v8, (a0)
+; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v9, v8, v9
-; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vfcvt.f.xu.v v9, v9
 ; RVD-NEXT:    fsrm a1
 ; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vsrl.vx v9, v9, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v9, v9, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vsub.vx v8, v9, a1
 ; RVD-NEXT:    li a1, 64
-; RVD-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse64.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -392,13 +394,13 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle8.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -414,6 +416,7 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v10, v8, v10
 ; RVF-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -422,7 +425,6 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vnsrl.wi v12, v16, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVF-NEXT:    vnsrl.wi v10, v12, 0
-; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
 ; RVF-NEXT:    vsub.vx v8, v10, a1
 ; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -434,6 +436,7 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v10, v8, v10
 ; RVD-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -442,7 +445,6 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vnsrl.wi v12, v16, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVD-NEXT:    vnsrl.wi v10, v12, 0
-; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
 ; RVD-NEXT:    vsub.vx v8, v10, a1
 ; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -472,23 +474,23 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle16.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -500,15 +502,15 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
-; RVF-NEXT:    vrsub.vi v10, v8, 0
-; RVF-NEXT:    vand.vv v10, v8, v10
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVF-NEXT:    vnsrl.wi v10, v12, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v10, v10, a1
+; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vand.vv v8, v8, v10
+; RVF-NEXT:    vfwcvt.f.xu.v v12, v8
+; RVF-NEXT:    vnsrl.wi v8, v12, 23
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 16
-; RVF-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse16.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -516,15 +518,15 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
-; RVD-NEXT:    vrsub.vi v10, v8, 0
-; RVD-NEXT:    vand.vv v10, v8, v10
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVD-NEXT:    vnsrl.wi v10, v12, 23
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vsub.vx v10, v10, a1
+; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vand.vv v8, v8, v10
+; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
+; RVD-NEXT:    vnsrl.wi v8, v12, 23
+; RVD-NEXT:    vsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 16
-; RVD-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse16.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -550,23 +552,23 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle32.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -579,17 +581,17 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVF-NEXT:    vle32.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v10, v8, v10
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v10, v10
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v10, v10, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v10, v10, a1
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vsrl.vi v8, v10, 23
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 32
-; RVF-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse32.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -597,16 +599,16 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v10, v8, v10
 ; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vnsrl.wx v10, v12, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v10, v10, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vsub.vx v8, v10, a1
 ; RVD-NEXT:    li a1, 32
-; RVD-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse32.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -630,40 +632,40 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v10, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
+; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vx v12, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -674,37 +676,37 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v10, v8, a1
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 1
+; RV64I-NEXT:    vsub.vx v10, v8, a5
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v10, v10, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vx v10, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -714,19 +716,21 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RVF-NEXT:    vle64.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v10, v8, v10
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RVF-NEXT:    vfncvt.f.xu.w v12, v10
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v10, v12, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vwsubu.vx v12, v10, a1
 ; RVF-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RVF-NEXT:    vsrl.vi v8, v12, 23
+; RVF-NEXT:    vwsubu.vx v10, v8, a1
 ; RVF-NEXT:    li a1, 64
-; RVF-NEXT:    vmerge.vxm v8, v12, a1, v0
+; RVF-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RVF-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; RVF-NEXT:    vse64.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -734,18 +738,18 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RVD-NEXT:    vle64.v v8, (a0)
+; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v10, v8, v10
-; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vfcvt.f.xu.v v10, v10
 ; RVD-NEXT:    fsrm a1
 ; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vsrl.vx v10, v10, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v10, v10, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vsub.vx v8, v10, a1
 ; RVD-NEXT:    li a1, 64
-; RVD-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse64.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -771,13 +775,13 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle8.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -843,23 +847,23 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle16.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -914,23 +918,23 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle32.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -943,9 +947,9 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVF-NEXT:    vle32.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v9
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVF-NEXT:    fsrm a1
 ; RVF-NEXT:    vsrl.vi v8, v8, 23
@@ -958,10 +962,10 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v9
 ; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vnsrl.wx v8, v10, a1
 ; RVD-NEXT:    li a1, 1023
 ; RVD-NEXT:    vsub.vx v8, v8, a1
@@ -987,40 +991,40 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v9, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    vmv.v.x v9, a1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vx v10, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1031,37 +1035,37 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v9, v8, a1
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 1
+; RV64I-NEXT:    vsub.vx v9, v8, a5
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v9, v9, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vx v9, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1071,9 +1075,9 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVF-NEXT:    vle64.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v9
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RVF-NEXT:    vfncvt.f.xu.w v9, v8
 ; RVF-NEXT:    fsrm a1
@@ -1087,9 +1091,9 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVD-NEXT:    vle64.v v8, (a0)
+; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v9
-; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVD-NEXT:    fsrm a1
 ; RVD-NEXT:    li a1, 52
@@ -1121,13 +1125,13 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle8.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -1196,23 +1200,23 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle16.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -1267,23 +1271,23 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle32.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -1296,9 +1300,9 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVF-NEXT:    vle32.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v10
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVF-NEXT:    fsrm a1
 ; RVF-NEXT:    vsrl.vi v8, v8, 23
@@ -1311,10 +1315,10 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v10
 ; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vnsrl.wx v8, v12, a1
 ; RVD-NEXT:    li a1, 1023
 ; RVD-NEXT:    vsub.vx v8, v8, a1
@@ -1340,40 +1344,40 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v10, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
+; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vx v12, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1384,37 +1388,37 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v10, v8, a1
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 1
+; RV64I-NEXT:    vsub.vx v10, v8, a5
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v10, v10, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vx v10, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1424,9 +1428,9 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RVF-NEXT:    vle64.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v10
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RVF-NEXT:    vfncvt.f.xu.w v10, v8
 ; RVF-NEXT:    fsrm a1
@@ -1440,9 +1444,9 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RVD-NEXT:    vle64.v v8, (a0)
+; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v10
-; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVD-NEXT:    fsrm a1
 ; RVD-NEXT:    li a1, 52
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 060a5c4224fe15..6a8d98d55289bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -10,30 +10,33 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vlm.v v8, (a0)
+; CHECK-NEXT:    li a0, -256
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v12, v9, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vadd.vi v13, v12, -16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v8, 2
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v10, v9, 1, v0
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
+; CHECK-NEXT:    vadd.vi v12, v12, -15
+; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v9, 1, v0
-; CHECK-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    vadd.vv v11, v9, v9
-; CHECK-NEXT:    li a0, -256
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; CHECK-NEXT:    vadd.vi v9, v11, -16
-; CHECK-NEXT:    vrgather.vv v8, v10, v9, v0.t
-; CHECK-NEXT:    vmsne.vi v9, v8, 0
-; CHECK-NEXT:    vnsrl.wi v8, v12, 8
-; CHECK-NEXT:    vadd.vi v11, v11, -15
-; CHECK-NEXT:    vrgather.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    vmv.v.v v0, v9
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vrgather.vv v8, v14, v13, v0.t
+; CHECK-NEXT:    vnsrl.wi v13, v10, 8
+; CHECK-NEXT:    vmsne.vi v10, v8, 0
+; CHECK-NEXT:    vrgather.vv v13, v14, v12, v0.t
+; CHECK-NEXT:    vmsne.vi v8, v13, 0
+; CHECK-NEXT:    vmv.v.v v0, v10
 ; CHECK-NEXT:    ret
   %vec = load <32 x i1>, ptr %p
   %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
index 04ebc7ca6b2b89..e13f4f4b50b0ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -32,20 +32,20 @@ define void @add_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    lw a5, 4(a0)
 ; RV32-NEXT:    lw a6, 8(a0)
 ; RV32-NEXT:    lw a7, 12(a0)
-; RV32-NEXT:    lw t0, 8(a1)
-; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    lw t0, 12(a1)
+; RV32-NEXT:    lw a1, 8(a1)
 ; RV32-NEXT:    add a3, a5, a3
 ; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a1, a6, a1
 ; RV32-NEXT:    sltu a4, a2, a4
+; RV32-NEXT:    sltu a5, a1, a6
 ; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    add a1, a7, a1
-; RV32-NEXT:    add t0, a6, t0
-; RV32-NEXT:    sltu a4, t0, a6
-; RV32-NEXT:    add a1, a1, a4
+; RV32-NEXT:    add a5, a7, a5
 ; RV32-NEXT:    sw a2, 0(a0)
 ; RV32-NEXT:    sw a3, 4(a0)
-; RV32-NEXT:    sw t0, 8(a0)
-; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    sw a1, 8(a0)
+; RV32-NEXT:    sw a5, 12(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_v2i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
index 493481ad129d29..e53876d69b59b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
@@ -325,20 +325,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV32-NEXT:    addi s0, sp, 384
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    andi a1, a1, 255
-; RV32-NEXT:    li a2, 128
-; RV32-NEXT:    addi a3, a0, 128
-; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vle8.v v16, (a3)
-; RV32-NEXT:    vle8.v v24, (a0)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    mv a2, sp
+; RV32-NEXT:    li a3, 128
+; RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV32-NEXT:    vle8.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 128
+; RV32-NEXT:    vle8.v v16, (a0)
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    vmseq.vi v0, v8, 0
+; RV32-NEXT:    vmv.v.i v24, 0
 ; RV32-NEXT:    vmseq.vi v8, v16, 0
-; RV32-NEXT:    vmseq.vi v0, v24, 0
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV32-NEXT:    vse8.v v24, (a0)
+; RV32-NEXT:    vmerge.vim v16, v24, 1, v0
+; RV32-NEXT:    vse8.v v16, (a2)
 ; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV32-NEXT:    vmerge.vim v8, v24, 1, v0
 ; RV32-NEXT:    addi a0, sp, 128
 ; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    lbu a0, 0(a1)
@@ -356,20 +356,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV64-NEXT:    addi s0, sp, 384
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    andi a1, a1, 255
-; RV64-NEXT:    li a2, 128
-; RV64-NEXT:    addi a3, a0, 128
-; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vle8.v v16, (a3)
-; RV64-NEXT:    vle8.v v24, (a0)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    li a3, 128
+; RV64-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV64-NEXT:    vle8.v v8, (a0)
+; RV64-NEXT:    addi a0, a0, 128
+; RV64-NEXT:    vle8.v v16, (a0)
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    vmseq.vi v0, v8, 0
+; RV64-NEXT:    vmv.v.i v24, 0
 ; RV64-NEXT:    vmseq.vi v8, v16, 0
-; RV64-NEXT:    vmseq.vi v0, v24, 0
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV64-NEXT:    vse8.v v24, (a0)
+; RV64-NEXT:    vmerge.vim v16, v24, 1, v0
+; RV64-NEXT:    vse8.v v16, (a2)
 ; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV64-NEXT:    vmerge.vim v8, v24, 1, v0
 ; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    lbu a0, 0(a1)
@@ -387,20 +387,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV32ZBS-NEXT:    addi s0, sp, 384
 ; RV32ZBS-NEXT:    andi sp, sp, -128
 ; RV32ZBS-NEXT:    andi a1, a1, 255
-; RV32ZBS-NEXT:    li a2, 128
-; RV32ZBS-NEXT:    addi a3, a0, 128
-; RV32ZBS-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32ZBS-NEXT:    vle8.v v16, (a3)
-; RV32ZBS-NEXT:    vle8.v v24, (a0)
-; RV32ZBS-NEXT:    mv a0, sp
-; RV32ZBS-NEXT:    add a1, a0, a1
+; RV32ZBS-NEXT:    mv a2, sp
+; RV32ZBS-NEXT:    li a3, 128
+; RV32ZBS-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV32ZBS-NEXT:    vle8.v v8, (a0)
+; RV32ZBS-NEXT:    addi a0, a0, 128
+; RV32ZBS-NEXT:    vle8.v v16, (a0)
+; RV32ZBS-NEXT:    add a1, a2, a1
+; RV32ZBS-NEXT:    vmseq.vi v0, v8, 0
+; RV32ZBS-NEXT:    vmv.v.i v24, 0
 ; RV32ZBS-NEXT:    vmseq.vi v8, v16, 0
-; RV32ZBS-NEXT:    vmseq.vi v0, v24, 0
-; RV32ZBS-NEXT:    vmv.v.i v16, 0
-; RV32ZBS-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV32ZBS-NEXT:    vse8.v v24, (a0)
+; RV32ZBS-NEXT:    vmerge.vim v16, v24, 1, v0
+; RV32ZBS-NEXT:    vse8.v v16, (a2)
 ; RV32ZBS-NEXT:    vmv1r.v v0, v8
-; RV32ZBS-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV32ZBS-NEXT:    vmerge.vim v8, v24, 1, v0
 ; RV32ZBS-NEXT:    addi a0, sp, 128
 ; RV32ZBS-NEXT:    vse8.v v8, (a0)
 ; RV32ZBS-NEXT:    lbu a0, 0(a1)
@@ -418,20 +418,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV64ZBS-NEXT:    addi s0, sp, 384
 ; RV64ZBS-NEXT:    andi sp, sp, -128
 ; RV64ZBS-NEXT:    andi a1, a1, 255
-; RV64ZBS-NEXT:    li a2, 128
-; RV64ZBS-NEXT:    addi a3, a0, 128
-; RV64ZBS-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64ZBS-NEXT:    vle8.v v16, (a3)
-; RV64ZBS-NEXT:    vle8.v v24, (a0)
-; RV64ZBS-NEXT:    mv a0, sp
-; RV64ZBS-NEXT:    add a1, a0, a1
+; RV64ZBS-NEXT:    mv a2, sp
+; RV64ZBS-NEXT:    li a3, 128
+; RV64ZBS-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV64ZBS-NEXT:    vle8.v v8, (a0)
+; RV64ZBS-NEXT:    addi a0, a0, 128
+; RV64ZBS-NEXT:    vle8.v v16, (a0)
+; RV64ZBS-NEXT:    add a1, a2, a1
+; RV64ZBS-NEXT:    vmseq.vi v0, v8, 0
+; RV64ZBS-NEXT:    vmv.v.i v24, 0
 ; RV64ZBS-NEXT:    vmseq.vi v8, v16, 0
-; RV64ZBS-NEXT:    vmseq.vi v0, v24, 0
-; RV64ZBS-NEXT:    vmv.v.i v16, 0
-; RV64ZBS-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV64ZBS-NEXT:    vse8.v v24, (a0)
+; RV64ZBS-NEXT:    vmerge.vim v16, v24, 1, v0
+; RV64ZBS-NEXT:    vse8.v v16, (a2)
 ; RV64ZBS-NEXT:    vmv1r.v v0, v8
-; RV64ZBS-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV64ZBS-NEXT:    vmerge.vim v8, v24, 1, v0
 ; RV64ZBS-NEXT:    addi a0, sp, 128
 ; RV64ZBS-NEXT:    vse8.v v8, (a0)
 ; RV64ZBS-NEXT:    lbu a0, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
index fdee80fb95627e..e9dca2c42e835b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
@@ -607,9 +607,9 @@ define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) {
 ; VLA-NEXT:    li a2, 64
 ; VLA-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
 ; VLA-NEXT:    vlm.v v0, (a0)
+; VLA-NEXT:    li a0, 42
 ; VLA-NEXT:    vmv.v.i v8, 0
 ; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
-; VLA-NEXT:    li a0, 42
 ; VLA-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
 ; VLA-NEXT:    vslidedown.vx v8, v8, a0
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
@@ -764,8 +764,8 @@ define void @extract_v2i1_nxv64i1_42(<vscale x 64 x i1> %x, ptr %y) {
 ; VLA:       # %bb.0:
 ; VLA-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; VLA-NEXT:    vmv.v.i v8, 0
-; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
 ; VLA-NEXT:    li a1, 42
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
 ; VLA-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
 ; VLA-NEXT:    vslidedown.vx v8, v8, a1
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index cb830d668d2e8c..a193d4e4e689f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -177,8 +177,8 @@ define i64 @extractelt_v4i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vx v10, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v10
@@ -273,8 +273,8 @@ define i64 @extractelt_v3i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v8, v8, 5
+; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
 ;
@@ -426,12 +426,12 @@ define i64 @extractelt_v2i64_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vslidedown.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
 ;
@@ -575,12 +575,12 @@ define i64 @extractelt_v4i64_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vslidedown.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
 ;
@@ -686,14 +686,14 @@ define i64 @extractelt_v3i64_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 3, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    add a1, a1, a1
+; RV32-NEXT:    addi a0, a1, 1
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    add a1, a1, a1
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v10, v8, a1
+; RV32-NEXT:    vslidedown.vx v8, v8, a0
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    addi a1, a1, 1
-; RV32-NEXT:    vslidedown.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
 ;
@@ -827,19 +827,19 @@ define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32-NEXT:    addi s0, sp, 384
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    andi a1, a1, 63
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    addi a3, a0, 128
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v8, (a3)
+; RV32-NEXT:    mv a2, sp
+; RV32-NEXT:    li a3, 32
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle32.v v16, (a0)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    vadd.vv v8, v8, v8
+; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    vadd.vv v16, v16, v16
+; RV32-NEXT:    vadd.vv v8, v8, v8
+; RV32-NEXT:    vse32.v v8, (a2)
 ; RV32-NEXT:    vse32.v v16, (a0)
-; RV32-NEXT:    addi a0, sp, 128
-; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    lw a0, 0(a1)
 ; RV32-NEXT:    addi sp, s0, -384
 ; RV32-NEXT:    lw ra, 380(sp) # 4-byte Folded Reload
@@ -855,19 +855,19 @@ define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV64-NEXT:    addi s0, sp, 384
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    andi a1, a1, 63
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    li a2, 32
-; RV64-NEXT:    addi a3, a0, 128
-; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v8, (a3)
+; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    li a3, 32
+; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vle32.v v16, (a0)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
+; RV64-NEXT:    addi a0, sp, 128
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    vadd.vv v16, v16, v16
+; RV64-NEXT:    vadd.vv v8, v8, v8
+; RV64-NEXT:    vse32.v v8, (a2)
 ; RV64-NEXT:    vse32.v v16, (a0)
-; RV64-NEXT:    addi a0, sp, 128
-; RV64-NEXT:    vse32.v v8, (a0)
 ; RV64-NEXT:    lw a0, 0(a1)
 ; RV64-NEXT:    addi sp, s0, -384
 ; RV64-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
@@ -931,14 +931,14 @@ define void @store_extractelt_v2i64(ptr %x, ptr %p) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    sw a2, 0(a1)
-; RV32-NEXT:    sw a0, 4(a1)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vmv.x.s a2, v9
+; RV32-NEXT:    sw a0, 0(a1)
+; RV32-NEXT:    sw a2, 4(a1)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: store_extractelt_v2i64:
@@ -1062,17 +1062,17 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV32NOM-NEXT:    addi a0, a0, %lo(.LCPI46_0)
 ; RV32NOM-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32NOM-NEXT:    vle32.v v9, (a0)
-; RV32NOM-NEXT:    vmulh.vv v9, v8, v9
 ; RV32NOM-NEXT:    lui a0, 1044480
 ; RV32NOM-NEXT:    vmv.s.x v10, a0
-; RV32NOM-NEXT:    vsext.vf4 v11, v10
-; RV32NOM-NEXT:    vand.vv v8, v8, v11
-; RV32NOM-NEXT:    vadd.vv v8, v9, v8
 ; RV32NOM-NEXT:    lui a0, 12320
 ; RV32NOM-NEXT:    addi a0, a0, 257
+; RV32NOM-NEXT:    vsext.vf4 v11, v10
+; RV32NOM-NEXT:    vand.vv v10, v8, v11
+; RV32NOM-NEXT:    vmulh.vv v8, v8, v9
 ; RV32NOM-NEXT:    vmv.s.x v9, a0
-; RV32NOM-NEXT:    vsext.vf4 v10, v9
-; RV32NOM-NEXT:    vsra.vv v9, v8, v10
+; RV32NOM-NEXT:    vsext.vf4 v11, v9
+; RV32NOM-NEXT:    vadd.vv v8, v8, v10
+; RV32NOM-NEXT:    vsra.vv v9, v8, v11
 ; RV32NOM-NEXT:    vsrl.vi v8, v8, 31
 ; RV32NOM-NEXT:    vadd.vv v8, v9, v8
 ; RV32NOM-NEXT:    vslidedown.vi v8, v8, 2
@@ -1083,10 +1083,10 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32M-NEXT:    vslidedown.vi v8, v8, 2
-; RV32M-NEXT:    vmv.x.s a0, v8
-; RV32M-NEXT:    lui a1, 322639
-; RV32M-NEXT:    addi a1, a1, -945
-; RV32M-NEXT:    mulh a0, a0, a1
+; RV32M-NEXT:    lui a0, 322639
+; RV32M-NEXT:    vmv.x.s a1, v8
+; RV32M-NEXT:    addi a0, a0, -945
+; RV32M-NEXT:    mulh a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 31
 ; RV32M-NEXT:    srai a0, a0, 2
 ; RV32M-NEXT:    add a0, a0, a1
@@ -1098,15 +1098,15 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV64NOM-NEXT:    addi a0, a0, %lo(.LCPI46_0)
 ; RV64NOM-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64NOM-NEXT:    vle32.v v9, (a0)
-; RV64NOM-NEXT:    vmulh.vv v9, v8, v9
 ; RV64NOM-NEXT:    lui a0, 1044480
 ; RV64NOM-NEXT:    vmv.s.x v10, a0
-; RV64NOM-NEXT:    vsext.vf4 v11, v10
-; RV64NOM-NEXT:    vand.vv v8, v8, v11
-; RV64NOM-NEXT:    vadd.vv v8, v9, v8
 ; RV64NOM-NEXT:    lui a0, 12320
 ; RV64NOM-NEXT:    addi a0, a0, 257
+; RV64NOM-NEXT:    vsext.vf4 v11, v10
+; RV64NOM-NEXT:    vand.vv v10, v8, v11
+; RV64NOM-NEXT:    vmulh.vv v8, v8, v9
 ; RV64NOM-NEXT:    vmv.s.x v9, a0
+; RV64NOM-NEXT:    vadd.vv v8, v8, v10
 ; RV64NOM-NEXT:    vsext.vf4 v10, v9
 ; RV64NOM-NEXT:    vsra.vv v8, v8, v10
 ; RV64NOM-NEXT:    vsrl.vi v9, v8, 31
@@ -1119,10 +1119,10 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64M-NEXT:    vslidedown.vi v8, v8, 2
-; RV64M-NEXT:    vmv.x.s a0, v8
-; RV64M-NEXT:    lui a1, 322639
-; RV64M-NEXT:    addiw a1, a1, -945
-; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    lui a0, 322639
+; RV64M-NEXT:    vmv.x.s a1, v8
+; RV64M-NEXT:    addiw a0, a0, -945
+; RV64M-NEXT:    mul a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 63
 ; RV64M-NEXT:    srai a0, a0, 34
 ; RV64M-NEXT:    add a0, a0, a1
@@ -1149,10 +1149,10 @@ define i32 @extractelt_udiv_v4i32(<4 x i32> %x) {
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32M-NEXT:    vslidedown.vi v8, v8, 2
-; RV32M-NEXT:    vmv.x.s a0, v8
-; RV32M-NEXT:    lui a1, 322639
-; RV32M-NEXT:    addi a1, a1, -945
-; RV32M-NEXT:    mulhu a0, a0, a1
+; RV32M-NEXT:    lui a0, 322639
+; RV32M-NEXT:    vmv.x.s a1, v8
+; RV32M-NEXT:    addi a0, a0, -945
+; RV32M-NEXT:    mulhu a0, a1, a0
 ; RV32M-NEXT:    srli a0, a0, 2
 ; RV32M-NEXT:    ret
 ;
@@ -1172,11 +1172,11 @@ define i32 @extractelt_udiv_v4i32(<4 x i32> %x) {
 ; RV64M-LABEL: extractelt_udiv_v4i32:
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    lui a0, 322639
-; RV64M-NEXT:    addi a0, a0, -945
-; RV64M-NEXT:    slli a0, a0, 32
 ; RV64M-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64M-NEXT:    vslidedown.vi v8, v8, 2
+; RV64M-NEXT:    addi a0, a0, -945
 ; RV64M-NEXT:    vmv.x.s a1, v8
+; RV64M-NEXT:    slli a0, a0, 32
 ; RV64M-NEXT:    slli a1, a1, 32
 ; RV64M-NEXT:    mulhu a0, a1, a0
 ; RV64M-NEXT:    srli a0, a0, 34
@@ -1191,8 +1191,8 @@ define float @extractelt_fadd_v4f32(<4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 267520
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fadd.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
@@ -1206,8 +1206,8 @@ define float @extractelt_fsub_v4f32(<4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 267520
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fsub.s fa0, fa4, fa5
 ; CHECK-NEXT:    ret
@@ -1221,8 +1221,8 @@ define float @extractelt_fmul_v4f32(<4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 267520
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fmul.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
@@ -1236,8 +1236,8 @@ define float @extractelt_fdiv_v4f32(<4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 267520
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fdiv.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
index 84895715e814f9..ab2d00b9b9137c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
@@ -123,10 +123,10 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -148,10 +148,10 @@ define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -171,10 +171,10 @@ define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -194,10 +194,10 @@ define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -217,10 +217,10 @@ define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -240,10 +240,10 @@ define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
index 3c99870dba950c..c6ce7c1bbe8b4a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
@@ -123,10 +123,10 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -148,10 +148,10 @@ define <1 x float> @floor_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -171,10 +171,10 @@ define <2 x float> @floor_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -194,10 +194,10 @@ define <4 x float> @floor_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -217,10 +217,10 @@ define <8 x float> @floor_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -240,10 +240,10 @@ define <16 x float> @floor_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index 49200fb7fe7faf..02e99ea513e69b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -72,10 +72,10 @@ define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -154,10 +154,10 @@ define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -238,10 +238,10 @@ define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -283,10 +283,10 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -324,10 +324,10 @@ define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.floor.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 2
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 2
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 2
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index 77d70647da1bee..9a3838d57a0b07 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -63,11 +63,9 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -135,11 +133,9 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
@@ -209,11 +205,9 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -285,11 +279,9 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index ae592119cf8815..900e02876cbe1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -315,21 +315,19 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v10, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v10, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v11, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -356,14 +354,12 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index fc331f6c909cbf..4a7f888fbced4f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -63,11 +63,9 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -135,11 +133,9 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
@@ -209,11 +205,9 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -285,11 +279,9 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index 8e042fc0785e19..db970c89d935c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -315,21 +315,19 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v10, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v10, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v11, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -356,14 +354,12 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
index 0b9fabb832e296..3a7ded1537ef66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
@@ -106,10 +106,10 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI4_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -132,10 +132,10 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -156,10 +156,10 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -180,10 +180,10 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -204,10 +204,10 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 10de74824548c1..127428f8d5a299 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -42,11 +42,11 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    li a0, 7
+; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vmul.vx v14, v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v14
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vadd.vi v8, v14, -14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgatherei16.vv v12, v10, v8, v0.t
@@ -252,8 +252,8 @@ define dso_local void @splat_load_licm(ptr %0) {
 ; RV32-LABEL: splat_load_licm:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lui a1, 1
-; RV32-NEXT:    add a1, a0, a1
 ; RV32-NEXT:    lui a2, 263168
+; RV32-NEXT:    add a1, a0, a1
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
@@ -266,8 +266,8 @@ define dso_local void @splat_load_licm(ptr %0) {
 ; RV64V-LABEL: splat_load_licm:
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    lui a1, 1
-; RV64V-NEXT:    add a1, a0, a1
 ; RV64V-NEXT:    lui a2, 263168
+; RV64V-NEXT:    add a1, a0, a1
 ; RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.x v8, a2
 ; RV64V-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
@@ -280,8 +280,8 @@ define dso_local void @splat_load_licm(ptr %0) {
 ; RVA22U64-LABEL: splat_load_licm:
 ; RVA22U64:       # %bb.0:
 ; RVA22U64-NEXT:    lui a1, 1
-; RVA22U64-NEXT:    add a1, a1, a0
 ; RVA22U64-NEXT:    lui a2, 263168
+; RVA22U64-NEXT:    add a1, a1, a0
 ; RVA22U64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVA22U64-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
@@ -294,8 +294,8 @@ define dso_local void @splat_load_licm(ptr %0) {
 ; RV64ZVFHMIN-LABEL: splat_load_licm:
 ; RV64ZVFHMIN:       # %bb.0:
 ; RV64ZVFHMIN-NEXT:    lui a1, 1
-; RV64ZVFHMIN-NEXT:    add a1, a0, a1
 ; RV64ZVFHMIN-NEXT:    lui a2, 263168
+; RV64ZVFHMIN-NEXT:    add a1, a0, a1
 ; RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64ZVFHMIN-NEXT:    vmv.v.x v8, a2
 ; RV64ZVFHMIN-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
@@ -593,22 +593,6 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV32-NEXT:    flw fs1, 52(s0)
 ; RV32-NEXT:    flw fs2, 56(s0)
 ; RV32-NEXT:    flw fs3, 60(s0)
-; RV32-NEXT:    fsw fs0, 112(sp)
-; RV32-NEXT:    fsw fs1, 116(sp)
-; RV32-NEXT:    fsw fs2, 120(sp)
-; RV32-NEXT:    fsw fs3, 124(sp)
-; RV32-NEXT:    fsw ft8, 96(sp)
-; RV32-NEXT:    fsw ft9, 100(sp)
-; RV32-NEXT:    fsw ft10, 104(sp)
-; RV32-NEXT:    fsw ft11, 108(sp)
-; RV32-NEXT:    fsw ft4, 80(sp)
-; RV32-NEXT:    fsw ft5, 84(sp)
-; RV32-NEXT:    fsw ft6, 88(sp)
-; RV32-NEXT:    fsw ft7, 92(sp)
-; RV32-NEXT:    fsw ft0, 64(sp)
-; RV32-NEXT:    fsw ft1, 68(sp)
-; RV32-NEXT:    fsw ft2, 72(sp)
-; RV32-NEXT:    fsw ft3, 76(sp)
 ; RV32-NEXT:    sw a4, 48(sp)
 ; RV32-NEXT:    sw a5, 52(sp)
 ; RV32-NEXT:    sw a6, 56(sp)
@@ -626,6 +610,22 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV32-NEXT:    fsw fa2, 8(sp)
 ; RV32-NEXT:    fsw fa3, 12(sp)
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    fsw fs0, 112(sp)
+; RV32-NEXT:    fsw fs1, 116(sp)
+; RV32-NEXT:    fsw fs2, 120(sp)
+; RV32-NEXT:    fsw fs3, 124(sp)
+; RV32-NEXT:    fsw ft8, 96(sp)
+; RV32-NEXT:    fsw ft9, 100(sp)
+; RV32-NEXT:    fsw ft10, 104(sp)
+; RV32-NEXT:    fsw ft11, 108(sp)
+; RV32-NEXT:    fsw ft4, 80(sp)
+; RV32-NEXT:    fsw ft5, 84(sp)
+; RV32-NEXT:    fsw ft6, 88(sp)
+; RV32-NEXT:    fsw ft7, 92(sp)
+; RV32-NEXT:    fsw ft0, 64(sp)
+; RV32-NEXT:    fsw ft1, 68(sp)
+; RV32-NEXT:    fsw ft2, 72(sp)
+; RV32-NEXT:    fsw ft3, 76(sp)
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
@@ -682,22 +682,22 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV64-NEXT:    addi s0, sp, 256
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    fmv.w.x ft0, a0
-; RV64-NEXT:    fmv.w.x ft1, a1
-; RV64-NEXT:    fmv.w.x ft2, a2
-; RV64-NEXT:    fmv.w.x ft3, a3
-; RV64-NEXT:    fmv.w.x ft4, a4
-; RV64-NEXT:    fmv.w.x ft5, a5
-; RV64-NEXT:    fmv.w.x ft6, a6
-; RV64-NEXT:    fmv.w.x ft7, a7
-; RV64-NEXT:    flw ft8, 0(s0)
-; RV64-NEXT:    flw ft9, 8(s0)
-; RV64-NEXT:    flw ft10, 16(s0)
-; RV64-NEXT:    flw ft11, 24(s0)
-; RV64-NEXT:    flw fs0, 32(s0)
-; RV64-NEXT:    flw fs1, 40(s0)
-; RV64-NEXT:    flw fs2, 48(s0)
-; RV64-NEXT:    flw fs3, 56(s0)
+; RV64-NEXT:    fmv.w.x ft4, a0
+; RV64-NEXT:    fmv.w.x ft5, a1
+; RV64-NEXT:    fmv.w.x ft6, a2
+; RV64-NEXT:    fmv.w.x ft7, a3
+; RV64-NEXT:    fmv.w.x fs0, a4
+; RV64-NEXT:    fmv.w.x fs1, a5
+; RV64-NEXT:    fmv.w.x fs2, a6
+; RV64-NEXT:    fmv.w.x fs3, a7
+; RV64-NEXT:    flw ft0, 0(s0)
+; RV64-NEXT:    flw ft1, 8(s0)
+; RV64-NEXT:    flw ft2, 16(s0)
+; RV64-NEXT:    flw ft3, 24(s0)
+; RV64-NEXT:    flw ft8, 32(s0)
+; RV64-NEXT:    flw ft9, 40(s0)
+; RV64-NEXT:    flw ft10, 48(s0)
+; RV64-NEXT:    flw ft11, 56(s0)
 ; RV64-NEXT:    flw fs4, 64(s0)
 ; RV64-NEXT:    flw fs5, 72(s0)
 ; RV64-NEXT:    flw fs6, 80(s0)
@@ -706,22 +706,6 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV64-NEXT:    flw fs9, 104(s0)
 ; RV64-NEXT:    flw fs10, 112(s0)
 ; RV64-NEXT:    flw fs11, 120(s0)
-; RV64-NEXT:    fsw fs8, 112(sp)
-; RV64-NEXT:    fsw fs9, 116(sp)
-; RV64-NEXT:    fsw fs10, 120(sp)
-; RV64-NEXT:    fsw fs11, 124(sp)
-; RV64-NEXT:    fsw fs4, 96(sp)
-; RV64-NEXT:    fsw fs5, 100(sp)
-; RV64-NEXT:    fsw fs6, 104(sp)
-; RV64-NEXT:    fsw fs7, 108(sp)
-; RV64-NEXT:    fsw fs0, 80(sp)
-; RV64-NEXT:    fsw fs1, 84(sp)
-; RV64-NEXT:    fsw fs2, 88(sp)
-; RV64-NEXT:    fsw fs3, 92(sp)
-; RV64-NEXT:    fsw ft8, 64(sp)
-; RV64-NEXT:    fsw ft9, 68(sp)
-; RV64-NEXT:    fsw ft10, 72(sp)
-; RV64-NEXT:    fsw ft11, 76(sp)
 ; RV64-NEXT:    fsw fa4, 16(sp)
 ; RV64-NEXT:    fsw fa5, 20(sp)
 ; RV64-NEXT:    fsw fa6, 24(sp)
@@ -730,15 +714,31 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV64-NEXT:    fsw fa1, 4(sp)
 ; RV64-NEXT:    fsw fa2, 8(sp)
 ; RV64-NEXT:    fsw fa3, 12(sp)
-; RV64-NEXT:    fsw ft4, 48(sp)
-; RV64-NEXT:    fsw ft5, 52(sp)
-; RV64-NEXT:    fsw ft6, 56(sp)
-; RV64-NEXT:    fsw ft7, 60(sp)
-; RV64-NEXT:    fsw ft0, 32(sp)
-; RV64-NEXT:    fsw ft1, 36(sp)
-; RV64-NEXT:    fsw ft2, 40(sp)
-; RV64-NEXT:    fsw ft3, 44(sp)
 ; RV64-NEXT:    li a0, 32
+; RV64-NEXT:    fsw fs0, 48(sp)
+; RV64-NEXT:    fsw fs1, 52(sp)
+; RV64-NEXT:    fsw fs2, 56(sp)
+; RV64-NEXT:    fsw fs3, 60(sp)
+; RV64-NEXT:    fsw ft4, 32(sp)
+; RV64-NEXT:    fsw ft5, 36(sp)
+; RV64-NEXT:    fsw ft6, 40(sp)
+; RV64-NEXT:    fsw ft7, 44(sp)
+; RV64-NEXT:    fsw fs8, 112(sp)
+; RV64-NEXT:    fsw fs9, 116(sp)
+; RV64-NEXT:    fsw fs10, 120(sp)
+; RV64-NEXT:    fsw fs11, 124(sp)
+; RV64-NEXT:    fsw fs4, 96(sp)
+; RV64-NEXT:    fsw fs5, 100(sp)
+; RV64-NEXT:    fsw fs6, 104(sp)
+; RV64-NEXT:    fsw fs7, 108(sp)
+; RV64-NEXT:    fsw ft8, 80(sp)
+; RV64-NEXT:    fsw ft9, 84(sp)
+; RV64-NEXT:    fsw ft10, 88(sp)
+; RV64-NEXT:    fsw ft11, 92(sp)
+; RV64-NEXT:    fsw ft0, 64(sp)
+; RV64-NEXT:    fsw ft1, 68(sp)
+; RV64-NEXT:    fsw ft2, 72(sp)
+; RV64-NEXT:    fsw ft3, 76(sp)
 ; RV64-NEXT:    mv a1, sp
 ; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
@@ -899,24 +899,24 @@ define <16 x double> @buildvec_v16f64(double %e0, double %e1, double %e2, double
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    sw a0, 120(sp)
 ; RV32-NEXT:    sw a1, 124(sp)
-; RV32-NEXT:    fld ft0, 120(sp)
+; RV32-NEXT:    fld ft0, 0(s0)
+; RV32-NEXT:    fld ft1, 8(s0)
+; RV32-NEXT:    fld ft2, 16(s0)
+; RV32-NEXT:    fld ft3, 24(s0)
+; RV32-NEXT:    fld ft4, 120(sp)
 ; RV32-NEXT:    sw a2, 120(sp)
 ; RV32-NEXT:    sw a3, 124(sp)
-; RV32-NEXT:    fld ft1, 120(sp)
+; RV32-NEXT:    fld ft5, 120(sp)
 ; RV32-NEXT:    sw a4, 120(sp)
 ; RV32-NEXT:    sw a5, 124(sp)
-; RV32-NEXT:    fld ft2, 120(sp)
+; RV32-NEXT:    fld ft6, 120(sp)
 ; RV32-NEXT:    sw a6, 120(sp)
 ; RV32-NEXT:    sw a7, 124(sp)
-; RV32-NEXT:    fld ft3, 120(sp)
-; RV32-NEXT:    fld ft4, 0(s0)
-; RV32-NEXT:    fld ft5, 8(s0)
-; RV32-NEXT:    fld ft6, 16(s0)
-; RV32-NEXT:    fld ft7, 24(s0)
-; RV32-NEXT:    fsd ft4, 224(sp)
-; RV32-NEXT:    fsd ft5, 232(sp)
-; RV32-NEXT:    fsd ft6, 240(sp)
-; RV32-NEXT:    fsd ft7, 248(sp)
+; RV32-NEXT:    fld ft7, 120(sp)
+; RV32-NEXT:    fsd ft0, 224(sp)
+; RV32-NEXT:    fsd ft1, 232(sp)
+; RV32-NEXT:    fsd ft2, 240(sp)
+; RV32-NEXT:    fsd ft3, 248(sp)
 ; RV32-NEXT:    fsd fa4, 160(sp)
 ; RV32-NEXT:    fsd fa5, 168(sp)
 ; RV32-NEXT:    fsd fa6, 176(sp)
@@ -925,10 +925,10 @@ define <16 x double> @buildvec_v16f64(double %e0, double %e1, double %e2, double
 ; RV32-NEXT:    fsd fa1, 136(sp)
 ; RV32-NEXT:    fsd fa2, 144(sp)
 ; RV32-NEXT:    fsd fa3, 152(sp)
-; RV32-NEXT:    fsd ft0, 192(sp)
-; RV32-NEXT:    fsd ft1, 200(sp)
-; RV32-NEXT:    fsd ft2, 208(sp)
-; RV32-NEXT:    fsd ft3, 216(sp)
+; RV32-NEXT:    fsd ft4, 192(sp)
+; RV32-NEXT:    fsd ft5, 200(sp)
+; RV32-NEXT:    fsd ft6, 208(sp)
+; RV32-NEXT:    fsd ft7, 216(sp)
 ; RV32-NEXT:    addi a0, sp, 128
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
@@ -1038,56 +1038,58 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    sw a0, 120(sp)
 ; RV32-NEXT:    sw a1, 124(sp)
-; RV32-NEXT:    fld ft0, 120(sp)
+; RV32-NEXT:    fld ft0, 0(s0)
+; RV32-NEXT:    fld ft1, 8(s0)
+; RV32-NEXT:    fld ft2, 16(s0)
+; RV32-NEXT:    fld ft3, 24(s0)
+; RV32-NEXT:    fld ft4, 32(s0)
+; RV32-NEXT:    fld ft5, 40(s0)
+; RV32-NEXT:    fld ft6, 48(s0)
+; RV32-NEXT:    fld ft7, 56(s0)
+; RV32-NEXT:    fld ft8, 64(s0)
+; RV32-NEXT:    fld ft9, 72(s0)
+; RV32-NEXT:    fld ft10, 80(s0)
+; RV32-NEXT:    fld ft11, 88(s0)
+; RV32-NEXT:    fld fs0, 96(s0)
+; RV32-NEXT:    fld fs1, 104(s0)
+; RV32-NEXT:    fld fs2, 112(s0)
+; RV32-NEXT:    fld fs3, 120(s0)
+; RV32-NEXT:    fld fs4, 128(s0)
+; RV32-NEXT:    fld fs5, 136(s0)
+; RV32-NEXT:    fld fs6, 144(s0)
+; RV32-NEXT:    fld fs7, 152(s0)
+; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    addi a1, sp, 256
+; RV32-NEXT:    fld fs8, 120(sp)
 ; RV32-NEXT:    sw a2, 120(sp)
 ; RV32-NEXT:    sw a3, 124(sp)
-; RV32-NEXT:    fld ft1, 120(sp)
+; RV32-NEXT:    fld fs9, 120(sp)
 ; RV32-NEXT:    sw a4, 120(sp)
 ; RV32-NEXT:    sw a5, 124(sp)
-; RV32-NEXT:    fld ft2, 120(sp)
+; RV32-NEXT:    fld fs10, 120(sp)
 ; RV32-NEXT:    sw a6, 120(sp)
 ; RV32-NEXT:    sw a7, 124(sp)
-; RV32-NEXT:    fld ft3, 120(sp)
-; RV32-NEXT:    fld ft4, 0(s0)
-; RV32-NEXT:    fld ft5, 8(s0)
-; RV32-NEXT:    fld ft6, 16(s0)
-; RV32-NEXT:    fld ft7, 24(s0)
-; RV32-NEXT:    fld ft8, 32(s0)
-; RV32-NEXT:    fld ft9, 40(s0)
-; RV32-NEXT:    fld ft10, 48(s0)
-; RV32-NEXT:    fld ft11, 56(s0)
-; RV32-NEXT:    fld fs0, 64(s0)
-; RV32-NEXT:    fld fs1, 72(s0)
-; RV32-NEXT:    fld fs2, 80(s0)
-; RV32-NEXT:    fld fs3, 88(s0)
-; RV32-NEXT:    fld fs4, 96(s0)
-; RV32-NEXT:    fld fs5, 104(s0)
-; RV32-NEXT:    fld fs6, 112(s0)
-; RV32-NEXT:    fld fs7, 120(s0)
-; RV32-NEXT:    fld fs8, 128(s0)
-; RV32-NEXT:    fld fs9, 136(s0)
-; RV32-NEXT:    fld fs10, 144(s0)
-; RV32-NEXT:    fld fs11, 152(s0)
-; RV32-NEXT:    fsd fs8, 224(sp)
-; RV32-NEXT:    fsd fs9, 232(sp)
-; RV32-NEXT:    fsd fs10, 240(sp)
-; RV32-NEXT:    fsd fs11, 248(sp)
-; RV32-NEXT:    fsd fs4, 192(sp)
-; RV32-NEXT:    fsd fs5, 200(sp)
-; RV32-NEXT:    fsd fs6, 208(sp)
-; RV32-NEXT:    fsd fs7, 216(sp)
-; RV32-NEXT:    fsd fs0, 160(sp)
-; RV32-NEXT:    fsd fs1, 168(sp)
-; RV32-NEXT:    fsd fs2, 176(sp)
-; RV32-NEXT:    fsd fs3, 184(sp)
-; RV32-NEXT:    fsd ft8, 128(sp)
-; RV32-NEXT:    fsd ft9, 136(sp)
-; RV32-NEXT:    fsd ft10, 144(sp)
-; RV32-NEXT:    fsd ft11, 152(sp)
-; RV32-NEXT:    fsd ft4, 352(sp)
-; RV32-NEXT:    fsd ft5, 360(sp)
-; RV32-NEXT:    fsd ft6, 368(sp)
-; RV32-NEXT:    fsd ft7, 376(sp)
+; RV32-NEXT:    fld fs11, 120(sp)
+; RV32-NEXT:    fsd fs4, 224(sp)
+; RV32-NEXT:    fsd fs5, 232(sp)
+; RV32-NEXT:    fsd fs6, 240(sp)
+; RV32-NEXT:    fsd fs7, 248(sp)
+; RV32-NEXT:    fsd fs0, 192(sp)
+; RV32-NEXT:    fsd fs1, 200(sp)
+; RV32-NEXT:    fsd fs2, 208(sp)
+; RV32-NEXT:    fsd fs3, 216(sp)
+; RV32-NEXT:    fsd ft8, 160(sp)
+; RV32-NEXT:    fsd ft9, 168(sp)
+; RV32-NEXT:    fsd ft10, 176(sp)
+; RV32-NEXT:    fsd ft11, 184(sp)
+; RV32-NEXT:    fsd ft4, 128(sp)
+; RV32-NEXT:    fsd ft5, 136(sp)
+; RV32-NEXT:    fsd ft6, 144(sp)
+; RV32-NEXT:    fsd ft7, 152(sp)
+; RV32-NEXT:    fsd ft0, 352(sp)
+; RV32-NEXT:    fsd ft1, 360(sp)
+; RV32-NEXT:    fsd ft2, 368(sp)
+; RV32-NEXT:    fsd ft3, 376(sp)
 ; RV32-NEXT:    fsd fa4, 288(sp)
 ; RV32-NEXT:    fsd fa5, 296(sp)
 ; RV32-NEXT:    fsd fa6, 304(sp)
@@ -1096,15 +1098,13 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 ; RV32-NEXT:    fsd fa1, 264(sp)
 ; RV32-NEXT:    fsd fa2, 272(sp)
 ; RV32-NEXT:    fsd fa3, 280(sp)
-; RV32-NEXT:    fsd ft0, 320(sp)
-; RV32-NEXT:    fsd ft1, 328(sp)
-; RV32-NEXT:    fsd ft2, 336(sp)
-; RV32-NEXT:    fsd ft3, 344(sp)
-; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    fsd fs8, 320(sp)
+; RV32-NEXT:    fsd fs9, 328(sp)
+; RV32-NEXT:    fsd fs10, 336(sp)
+; RV32-NEXT:    fsd fs11, 344(sp)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v16, (a0)
-; RV32-NEXT:    addi a0, sp, 256
-; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    vle64.v v8, (a1)
 ; RV32-NEXT:    addi sp, s0, -512
 ; RV32-NEXT:    .cfi_def_cfa sp, 512
 ; RV32-NEXT:    lw ra, 508(sp) # 4-byte Folded Reload
@@ -1190,6 +1190,8 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 ; RV64-NEXT:    fsd fa1, 136(sp)
 ; RV64-NEXT:    fsd fa2, 144(sp)
 ; RV64-NEXT:    fsd fa3, 152(sp)
+; RV64-NEXT:    addi a0, sp, 128
+; RV64-NEXT:    mv a1, sp
 ; RV64-NEXT:    fsd fs0, 96(sp)
 ; RV64-NEXT:    fsd fs1, 104(sp)
 ; RV64-NEXT:    fsd fs2, 112(sp)
@@ -1206,11 +1208,9 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 ; RV64-NEXT:    fsd ft1, 8(sp)
 ; RV64-NEXT:    fsd ft2, 16(sp)
 ; RV64-NEXT:    fsd ft3, 24(sp)
-; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    vle64.v v16, (a0)
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi sp, s0, -384
 ; RV64-NEXT:    .cfi_def_cfa sp, 384
 ; RV64-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
@@ -1266,20 +1266,16 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7, double %e8, double %e9, double %e10, double %e11, double %e12, double %e13, double %e14, double %e15, double %e16, double %e17, double %e18, double %e19, double %e20, double %e21, double %e22, double %e23, double %e24, double %e25, double %e26, double %e27, double %e28, double %e29, double %e30, double %e31) vscale_range(2,2) {
 ; RV32-LABEL: buildvec_v32f64_exact_vlen:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -112
-; RV32-NEXT:    .cfi_def_cfa_offset 112
-; RV32-NEXT:    fsd fs0, 104(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs1, 96(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs2, 88(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs3, 80(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs4, 72(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs5, 64(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs6, 56(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs7, 48(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs8, 40(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs9, 32(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs10, 24(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs11, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    .cfi_def_cfa_offset 80
+; RV32-NEXT:    fsd fs0, 72(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs1, 64(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs2, 56(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs3, 48(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs4, 40(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs5, 32(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs6, 24(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs7, 16(sp) # 8-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset fs0, -8
 ; RV32-NEXT:    .cfi_offset fs1, -16
 ; RV32-NEXT:    .cfi_offset fs2, -24
@@ -1288,87 +1284,79 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double
 ; RV32-NEXT:    .cfi_offset fs5, -48
 ; RV32-NEXT:    .cfi_offset fs6, -56
 ; RV32-NEXT:    .cfi_offset fs7, -64
-; RV32-NEXT:    .cfi_offset fs8, -72
-; RV32-NEXT:    .cfi_offset fs9, -80
-; RV32-NEXT:    .cfi_offset fs10, -88
-; RV32-NEXT:    .cfi_offset fs11, -96
 ; RV32-NEXT:    sw a6, 8(sp)
 ; RV32-NEXT:    sw a7, 12(sp)
-; RV32-NEXT:    fld ft6, 8(sp)
+; RV32-NEXT:    fld ft0, 232(sp)
+; RV32-NEXT:    fld ft4, 224(sp)
+; RV32-NEXT:    fld ft1, 216(sp)
+; RV32-NEXT:    fld ft7, 208(sp)
+; RV32-NEXT:    fld ft2, 200(sp)
+; RV32-NEXT:    fld ft10, 192(sp)
+; RV32-NEXT:    fld ft3, 184(sp)
+; RV32-NEXT:    fld fs1, 176(sp)
+; RV32-NEXT:    fld ft5, 168(sp)
+; RV32-NEXT:    fld fs2, 160(sp)
+; RV32-NEXT:    fld ft6, 152(sp)
+; RV32-NEXT:    fld fs3, 144(sp)
+; RV32-NEXT:    fld ft8, 120(sp)
+; RV32-NEXT:    fld fs4, 112(sp)
+; RV32-NEXT:    fld ft9, 136(sp)
+; RV32-NEXT:    fld fs5, 128(sp)
+; RV32-NEXT:    fld ft11, 104(sp)
+; RV32-NEXT:    fld fs6, 96(sp)
+; RV32-NEXT:    fld fs0, 88(sp)
+; RV32-NEXT:    fld fs7, 80(sp)
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vfmv.v.f v8, fa2
+; RV32-NEXT:    vfmv.v.f v10, fa0
+; RV32-NEXT:    vfmv.v.f v11, fa4
+; RV32-NEXT:    vfmv.v.f v12, fa6
+; RV32-NEXT:    fld fa4, 8(sp)
 ; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw a5, 12(sp)
-; RV32-NEXT:    fld ft7, 8(sp)
+; RV32-NEXT:    vfslide1down.vf v9, v8, fa3
+; RV32-NEXT:    vfslide1down.vf v8, v10, fa1
+; RV32-NEXT:    vfslide1down.vf v10, v11, fa5
+; RV32-NEXT:    vfslide1down.vf v11, v12, fa7
+; RV32-NEXT:    fld fa5, 8(sp)
 ; RV32-NEXT:    sw a2, 8(sp)
 ; RV32-NEXT:    sw a3, 12(sp)
-; RV32-NEXT:    fld ft8, 8(sp)
+; RV32-NEXT:    fld fa3, 8(sp)
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    fld ft9, 8(sp)
-; RV32-NEXT:    fld ft0, 264(sp)
-; RV32-NEXT:    fld ft1, 256(sp)
-; RV32-NEXT:    fld ft2, 248(sp)
-; RV32-NEXT:    fld ft3, 240(sp)
-; RV32-NEXT:    fld ft4, 232(sp)
-; RV32-NEXT:    fld ft5, 224(sp)
-; RV32-NEXT:    fld ft10, 216(sp)
-; RV32-NEXT:    fld ft11, 208(sp)
-; RV32-NEXT:    fld fs0, 200(sp)
-; RV32-NEXT:    fld fs1, 192(sp)
-; RV32-NEXT:    fld fs2, 184(sp)
-; RV32-NEXT:    fld fs3, 176(sp)
-; RV32-NEXT:    fld fs4, 152(sp)
-; RV32-NEXT:    fld fs5, 144(sp)
-; RV32-NEXT:    fld fs6, 168(sp)
-; RV32-NEXT:    fld fs7, 160(sp)
-; RV32-NEXT:    fld fs8, 136(sp)
-; RV32-NEXT:    fld fs9, 128(sp)
-; RV32-NEXT:    fld fs10, 120(sp)
-; RV32-NEXT:    fld fs11, 112(sp)
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vfmv.v.f v8, ft9
-; RV32-NEXT:    vfslide1down.vf v12, v8, ft8
-; RV32-NEXT:    vfmv.v.f v8, fa2
-; RV32-NEXT:    vfslide1down.vf v9, v8, fa3
-; RV32-NEXT:    vfmv.v.f v8, fa0
-; RV32-NEXT:    vfslide1down.vf v8, v8, fa1
-; RV32-NEXT:    vfmv.v.f v10, fa4
-; RV32-NEXT:    vfslide1down.vf v10, v10, fa5
-; RV32-NEXT:    vfmv.v.f v11, fa6
-; RV32-NEXT:    vfslide1down.vf v11, v11, fa7
-; RV32-NEXT:    vfmv.v.f v13, ft7
-; RV32-NEXT:    vfslide1down.vf v13, v13, ft6
-; RV32-NEXT:    vfmv.v.f v14, fs11
-; RV32-NEXT:    vfslide1down.vf v14, v14, fs10
-; RV32-NEXT:    vfmv.v.f v15, fs9
-; RV32-NEXT:    vfslide1down.vf v15, v15, fs8
-; RV32-NEXT:    vfmv.v.f v16, fs7
-; RV32-NEXT:    vfslide1down.vf v17, v16, fs6
+; RV32-NEXT:    fld fa2, 8(sp)
+; RV32-NEXT:    vfmv.v.f v12, fs7
+; RV32-NEXT:    vfmv.v.f v13, fs6
 ; RV32-NEXT:    vfmv.v.f v16, fs5
-; RV32-NEXT:    vfslide1down.vf v16, v16, fs4
-; RV32-NEXT:    vfmv.v.f v18, fs3
-; RV32-NEXT:    vfslide1down.vf v18, v18, fs2
-; RV32-NEXT:    vfmv.v.f v19, fs1
-; RV32-NEXT:    vfslide1down.vf v19, v19, fs0
-; RV32-NEXT:    vfmv.v.f v20, ft11
-; RV32-NEXT:    vfslide1down.vf v20, v20, ft10
-; RV32-NEXT:    vfmv.v.f v21, ft5
-; RV32-NEXT:    vfslide1down.vf v21, v21, ft4
-; RV32-NEXT:    vfmv.v.f v22, ft3
-; RV32-NEXT:    vfslide1down.vf v22, v22, ft2
-; RV32-NEXT:    vfmv.v.f v23, ft1
-; RV32-NEXT:    vfslide1down.vf v23, v23, ft0
-; RV32-NEXT:    fld fs0, 104(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs1, 96(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs2, 88(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs3, 80(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs4, 72(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs5, 64(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs6, 56(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs7, 48(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs8, 40(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs9, 32(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs10, 24(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs11, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT:    vfmv.v.f v18, fs4
+; RV32-NEXT:    vfmv.v.f v19, fs3
+; RV32-NEXT:    vfmv.v.f v20, fs2
+; RV32-NEXT:    vfmv.v.f v21, fs1
+; RV32-NEXT:    vfmv.v.f v22, ft10
+; RV32-NEXT:    vfmv.v.f v23, ft7
+; RV32-NEXT:    vfmv.v.f v24, ft4
+; RV32-NEXT:    vfslide1down.vf v14, v12, fs0
+; RV32-NEXT:    vfslide1down.vf v15, v13, ft11
+; RV32-NEXT:    vfslide1down.vf v17, v16, ft9
+; RV32-NEXT:    vfslide1down.vf v16, v18, ft8
+; RV32-NEXT:    vfslide1down.vf v18, v19, ft6
+; RV32-NEXT:    vfslide1down.vf v19, v20, ft5
+; RV32-NEXT:    vfslide1down.vf v20, v21, ft3
+; RV32-NEXT:    vfslide1down.vf v21, v22, ft2
+; RV32-NEXT:    vfslide1down.vf v22, v23, ft1
+; RV32-NEXT:    vfmv.v.f v12, fa5
+; RV32-NEXT:    vfslide1down.vf v13, v12, fa4
+; RV32-NEXT:    vfmv.v.f v12, fa2
+; RV32-NEXT:    vfslide1down.vf v12, v12, fa3
+; RV32-NEXT:    vfslide1down.vf v23, v24, ft0
+; RV32-NEXT:    fld fs0, 72(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs1, 64(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs2, 56(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs3, 48(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs4, 40(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs5, 32(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs6, 24(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs7, 16(sp) # 8-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore fs0
 ; RV32-NEXT:    .cfi_restore fs1
 ; RV32-NEXT:    .cfi_restore fs2
@@ -1377,11 +1365,7 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double
 ; RV32-NEXT:    .cfi_restore fs5
 ; RV32-NEXT:    .cfi_restore fs6
 ; RV32-NEXT:    .cfi_restore fs7
-; RV32-NEXT:    .cfi_restore fs8
-; RV32-NEXT:    .cfi_restore fs9
-; RV32-NEXT:    .cfi_restore fs10
-; RV32-NEXT:    .cfi_restore fs11
-; RV32-NEXT:    addi sp, sp, 112
+; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1405,59 +1389,59 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double
 ; RV64-NEXT:    .cfi_offset fs5, -48
 ; RV64-NEXT:    .cfi_offset fs6, -56
 ; RV64-NEXT:    .cfi_offset fs7, -64
-; RV64-NEXT:    fmv.d.x ft4, a7
-; RV64-NEXT:    fmv.d.x ft5, a5
-; RV64-NEXT:    fmv.d.x ft6, a3
-; RV64-NEXT:    fmv.d.x ft7, a1
+; RV64-NEXT:    fmv.d.x ft11, a7
+; RV64-NEXT:    fmv.d.x fs0, a5
+; RV64-NEXT:    fmv.d.x fs1, a3
+; RV64-NEXT:    fmv.d.x fs2, a1
 ; RV64-NEXT:    fld ft0, 184(sp)
-; RV64-NEXT:    fld ft1, 176(sp)
-; RV64-NEXT:    fld ft2, 168(sp)
-; RV64-NEXT:    fld ft3, 160(sp)
-; RV64-NEXT:    fld ft8, 152(sp)
+; RV64-NEXT:    fld ft3, 176(sp)
+; RV64-NEXT:    fld ft1, 168(sp)
+; RV64-NEXT:    fld ft6, 160(sp)
+; RV64-NEXT:    fld ft2, 152(sp)
 ; RV64-NEXT:    fld ft9, 144(sp)
-; RV64-NEXT:    fld ft10, 136(sp)
-; RV64-NEXT:    fld ft11, 128(sp)
-; RV64-NEXT:    fld fs0, 120(sp)
-; RV64-NEXT:    fld fs1, 112(sp)
-; RV64-NEXT:    fld fs2, 104(sp)
-; RV64-NEXT:    fld fs3, 96(sp)
-; RV64-NEXT:    fld fs4, 72(sp)
-; RV64-NEXT:    fld fs5, 64(sp)
-; RV64-NEXT:    fld fs6, 88(sp)
+; RV64-NEXT:    fld ft4, 136(sp)
+; RV64-NEXT:    fld fs3, 128(sp)
+; RV64-NEXT:    fld ft5, 120(sp)
+; RV64-NEXT:    fld fs4, 112(sp)
+; RV64-NEXT:    fld ft7, 104(sp)
+; RV64-NEXT:    fld fs5, 96(sp)
+; RV64-NEXT:    fld ft8, 72(sp)
+; RV64-NEXT:    fld fs6, 64(sp)
+; RV64-NEXT:    fld ft10, 88(sp)
 ; RV64-NEXT:    fld fs7, 80(sp)
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vfmv.v.f v8, fa2
+; RV64-NEXT:    vfmv.v.f v10, fa0
+; RV64-NEXT:    vfmv.v.f v11, fa4
+; RV64-NEXT:    vfmv.v.f v12, fa6
+; RV64-NEXT:    vmv.v.x v13, a0
+; RV64-NEXT:    vmv.v.x v14, a2
+; RV64-NEXT:    vmv.v.x v15, a4
+; RV64-NEXT:    vmv.v.x v16, a6
 ; RV64-NEXT:    vfslide1down.vf v9, v8, fa3
-; RV64-NEXT:    vfmv.v.f v8, fa0
-; RV64-NEXT:    vfslide1down.vf v8, v8, fa1
-; RV64-NEXT:    vfmv.v.f v10, fa4
-; RV64-NEXT:    vfslide1down.vf v10, v10, fa5
-; RV64-NEXT:    vfmv.v.f v11, fa6
-; RV64-NEXT:    vfslide1down.vf v11, v11, fa7
-; RV64-NEXT:    vmv.v.x v12, a0
-; RV64-NEXT:    vfslide1down.vf v12, v12, ft7
-; RV64-NEXT:    vmv.v.x v13, a2
-; RV64-NEXT:    vfslide1down.vf v13, v13, ft6
-; RV64-NEXT:    vmv.v.x v14, a4
-; RV64-NEXT:    vfslide1down.vf v14, v14, ft5
-; RV64-NEXT:    vmv.v.x v15, a6
-; RV64-NEXT:    vfslide1down.vf v15, v15, ft4
+; RV64-NEXT:    vfslide1down.vf v8, v10, fa1
+; RV64-NEXT:    vfslide1down.vf v10, v11, fa5
+; RV64-NEXT:    vfslide1down.vf v11, v12, fa7
+; RV64-NEXT:    vfslide1down.vf v12, v13, fs2
+; RV64-NEXT:    vfslide1down.vf v13, v14, fs1
+; RV64-NEXT:    vfslide1down.vf v14, v15, fs0
+; RV64-NEXT:    vfslide1down.vf v15, v16, ft11
 ; RV64-NEXT:    vfmv.v.f v16, fs7
-; RV64-NEXT:    vfslide1down.vf v17, v16, fs6
-; RV64-NEXT:    vfmv.v.f v16, fs5
-; RV64-NEXT:    vfslide1down.vf v16, v16, fs4
-; RV64-NEXT:    vfmv.v.f v18, fs3
-; RV64-NEXT:    vfslide1down.vf v18, v18, fs2
-; RV64-NEXT:    vfmv.v.f v19, fs1
-; RV64-NEXT:    vfslide1down.vf v19, v19, fs0
-; RV64-NEXT:    vfmv.v.f v20, ft11
-; RV64-NEXT:    vfslide1down.vf v20, v20, ft10
-; RV64-NEXT:    vfmv.v.f v21, ft9
-; RV64-NEXT:    vfslide1down.vf v21, v21, ft8
-; RV64-NEXT:    vfmv.v.f v22, ft3
-; RV64-NEXT:    vfslide1down.vf v22, v22, ft2
-; RV64-NEXT:    vfmv.v.f v23, ft1
-; RV64-NEXT:    vfslide1down.vf v23, v23, ft0
+; RV64-NEXT:    vfmv.v.f v18, fs6
+; RV64-NEXT:    vfmv.v.f v19, fs5
+; RV64-NEXT:    vfmv.v.f v20, fs4
+; RV64-NEXT:    vfmv.v.f v21, fs3
+; RV64-NEXT:    vfmv.v.f v22, ft9
+; RV64-NEXT:    vfmv.v.f v23, ft6
+; RV64-NEXT:    vfmv.v.f v24, ft3
+; RV64-NEXT:    vfslide1down.vf v17, v16, ft10
+; RV64-NEXT:    vfslide1down.vf v16, v18, ft8
+; RV64-NEXT:    vfslide1down.vf v18, v19, ft7
+; RV64-NEXT:    vfslide1down.vf v19, v20, ft5
+; RV64-NEXT:    vfslide1down.vf v20, v21, ft4
+; RV64-NEXT:    vfslide1down.vf v21, v22, ft2
+; RV64-NEXT:    vfslide1down.vf v22, v23, ft1
+; RV64-NEXT:    vfslide1down.vf v23, v24, ft0
 ; RV64-NEXT:    fld fs0, 56(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    fld fs1, 48(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    fld fs2, 40(sp) # 8-byte Folded Reload
@@ -1752,15 +1736,15 @@ define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
+; CHECK-NEXT:    vfmv.v.f v9, fa4
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
-; CHECK-NEXT:    vfslide1down.vf v9, v8, fa3
-; CHECK-NEXT:    vfmv.v.f v8, fa4
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa6
-; CHECK-NEXT:    vmv.v.i v0, 15
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa7
-; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa6
+; CHECK-NEXT:    vfslide1down.vf v10, v8, fa3
+; CHECK-NEXT:    vfslide1down.vf v8, v9, fa7
+; CHECK-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x float> poison, float %e0, i64 0
   %v1 = insertelement <8 x float> %v0, float %e1, i64 1
@@ -1803,15 +1787,15 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
+; CHECK-NEXT:    vfmv.v.f v9, fa4
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
-; CHECK-NEXT:    vfslide1down.vf v9, v8, fa3
-; CHECK-NEXT:    vfmv.v.f v8, fa4
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa6
-; CHECK-NEXT:    vmv.v.i v0, 15
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa7
-; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa6
+; CHECK-NEXT:    vfslide1down.vf v10, v8, fa3
+; CHECK-NEXT:    vfslide1down.vf v8, v9, fa7
+; CHECK-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x double> poison, double %e0, i64 0
   %v1 = insertelement <8 x double> %v0, double %e1, i64 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 5ae47a01a37e15..2e558c425c377d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -40,13 +40,11 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; V128:       # %bb.0:
 ; V128-NEXT:    vmv1r.v v12, v9
 ; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; V128-NEXT:    vid.v v9
-; V128-NEXT:    vsrl.vi v14, v9, 1
-; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; V128-NEXT:    vrgatherei16.vv v10, v8, v14
-; V128-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; V128-NEXT:    vid.v v10
 ; V128-NEXT:    vmv.v.i v0, 10
-; V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; V128-NEXT:    vsrl.vi v14, v10, 1
+; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; V128-NEXT:    vrgatherei16.vv v10, v8, v14
 ; V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
 ; V128-NEXT:    vmv.v.v v8, v10
 ; V128-NEXT:    ret
@@ -244,26 +242,27 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-NEXT:    slli a0, a0, 3
 ; V128-NEXT:    sub sp, sp, a0
 ; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT:    vmv8r.v v0, v16
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vmv8r.v v24, v16
 ; V128-NEXT:    vmv8r.v v16, v8
+; V128-NEXT:    vmv8r.v v8, v24
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v8, v0, 16
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v0, v8
+; V128-NEXT:    vslidedown.vi v0, v24, 16
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v24, a0, v8
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v24, v8, v0
+; V128-NEXT:    vwmaccu.vx v24, a0, v0
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; V128-NEXT:    vslidedown.vi v0, v16, 16
+; V128-NEXT:    lui a1, 699051
+; V128-NEXT:    li a2, 32
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v8, v0, v16
-; V128-NEXT:    vwmaccu.vx v8, a0, v16
-; V128-NEXT:    lui a1, 699051
 ; V128-NEXT:    addi a1, a1, -1366
 ; V128-NEXT:    vmv.s.x v0, a1
-; V128-NEXT:    li a1, 32
-; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; V128-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; V128-NEXT:    addi a1, sp, 16
 ; V128-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
index 31e2d75e514b41..c14eae0b1de61a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
@@ -558,13 +558,11 @@ define void @fcmp_ord_vv_v4f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v8, v8
+; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
 ; ZVFHMIN-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -610,13 +608,11 @@ define void @fcmp_uno_vv_v4f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfne.vv v8, v8, v8
+; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
 ; ZVFHMIN-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -1195,13 +1191,13 @@ define void @fcmp_ord_vf_v4f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
@@ -1249,13 +1245,13 @@ define void @fcmp_uno_vf_v4f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
@@ -1836,13 +1832,13 @@ define void @fcmp_ord_fv_v4f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
@@ -1890,13 +1886,13 @@ define void @fcmp_uno_fv_v4f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index c24ade1e6d8eff..36bbec12e9b06c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -901,11 +901,11 @@ define void @copysign_vf_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-NEXT:    fmv.x.w a1, fa0
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a2, 8
 ; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    addi a2, a1, -1
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    addi a1, a2, -1
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -923,12 +923,12 @@ define void @copysign_vf_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-NEXT:    fmv.x.w a1, fa0
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a2, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    addi a2, a1, -1
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    addi a1, a2, -1
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
@@ -955,11 +955,11 @@ define void @copysign_vf_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    addi a2, a1, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a2
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
+; ZVFHMIN-NEXT:    addi a1, a2, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
@@ -985,12 +985,12 @@ define void @copysign_vf_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    addi a2, a1, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a2
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
+; ZVFHMIN-NEXT:    addi a1, a2, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    vse16.v v8, (a0)
@@ -1042,8 +1042,8 @@ define void @copysign_neg_v8bf16(ptr %x, ptr %y) {
 ; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vle16.v v9, (a0)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vxor.vx v8, v8, a1
 ; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    vxor.vx v8, v8, a1
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vor.vv v8, v9, v8
@@ -1064,9 +1064,9 @@ define void @copysign_neg_v6bf16(ptr %x, ptr %y) {
 ; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vle16.v v9, (a0)
 ; CHECK-NEXT:    lui a1, 8
+; CHECK-NEXT:    addi a2, a1, -1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    addi a2, a1, -1
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
@@ -1097,8 +1097,8 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    addi a2, a1, -1
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v9, v8
@@ -1128,9 +1128,9 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    addi a2, a1, -1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    addi a2, a1, -1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
@@ -1602,11 +1602,11 @@ define void @fmsub_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-NEXT:    vle16.v v10, (a1)
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v8, v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -1630,11 +1630,11 @@ define void @fmsub_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v8, v12, v14
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -1667,11 +1667,11 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v10, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -1705,11 +1705,11 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3717,14 +3717,14 @@ define void @fmsub_vf_v8bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a2
 ; CHECK-NEXT:    lui a1, 8
+; CHECK-NEXT:    vmv.v.x v10, a2
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v8, v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -3746,15 +3746,15 @@ define void @fmsub_vf_v6bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a2
-; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v8, v12, v14
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -3785,14 +3785,14 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vmv.v.x v10, a2
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vmv.v.x v10, a2
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3823,15 +3823,15 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a2
-; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3929,11 +3929,11 @@ define void @trunc_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -3954,12 +3954,12 @@ define void @trunc_v6bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -3995,11 +3995,11 @@ define void @trunc_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4036,12 +4036,12 @@ define void @trunc_v6f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4062,9 +4062,9 @@ define void @trunc_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4104,11 +4104,11 @@ define void @ceil_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4131,12 +4131,12 @@ define void @ceil_v6bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4176,11 +4176,11 @@ define void @ceil_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4221,12 +4221,12 @@ define void @ceil_v6f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4249,9 +4249,9 @@ define void @ceil_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
@@ -4295,11 +4295,11 @@ define void @floor_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4322,12 +4322,12 @@ define void @floor_v6bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4367,11 +4367,11 @@ define void @floor_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4412,12 +4412,12 @@ define void @floor_v6f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4440,9 +4440,9 @@ define void @floor_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
@@ -4486,11 +4486,11 @@ define void @round_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4513,12 +4513,12 @@ define void @round_v6bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4558,11 +4558,11 @@ define void @round_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4603,12 +4603,12 @@ define void @round_v6f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4631,9 +4631,9 @@ define void @round_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
@@ -4677,11 +4677,11 @@ define void @rint_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4717,11 +4717,11 @@ define void @rint_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4742,9 +4742,9 @@ define void @rint_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4784,11 +4784,11 @@ define void @nearbyint_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4828,11 +4828,11 @@ define void @nearbyint_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4855,9 +4855,9 @@ define void @nearbyint_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
index da0bc5983a1251..7f4483a8f77d9c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
@@ -124,21 +124,21 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
 ; ZVFH32:       # %bb.0:
 ; ZVFH32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH32-NEXT:    vfncvt.rtz.x.f.w v9, v8
+; ZVFH32-NEXT:    lui a1, 8
 ; ZVFH32-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFH32-NEXT:    vmv.x.s a1, v8
 ; ZVFH32-NEXT:    vmv.x.s a2, v9
-; ZVFH32-NEXT:    lui a3, 8
-; ZVFH32-NEXT:    addi a3, a3, -1
-; ZVFH32-NEXT:    and a2, a2, a3
-; ZVFH32-NEXT:    vslidedown.vi v8, v9, 1
-; ZVFH32-NEXT:    vmv.x.s a4, v8
-; ZVFH32-NEXT:    and a3, a4, a3
-; ZVFH32-NEXT:    slli a4, a1, 17
+; ZVFH32-NEXT:    addi a1, a1, -1
+; ZVFH32-NEXT:    vslidedown.vi v9, v9, 1
+; ZVFH32-NEXT:    vmv.x.s a3, v8
+; ZVFH32-NEXT:    and a2, a2, a1
+; ZVFH32-NEXT:    vmv.x.s a4, v9
+; ZVFH32-NEXT:    and a1, a4, a1
+; ZVFH32-NEXT:    slli a4, a3, 17
+; ZVFH32-NEXT:    slli a3, a3, 30
 ; ZVFH32-NEXT:    srli a4, a4, 19
-; ZVFH32-NEXT:    slli a3, a3, 15
-; ZVFH32-NEXT:    slli a1, a1, 30
+; ZVFH32-NEXT:    slli a1, a1, 15
+; ZVFH32-NEXT:    or a2, a2, a3
 ; ZVFH32-NEXT:    or a1, a2, a1
-; ZVFH32-NEXT:    or a1, a1, a3
 ; ZVFH32-NEXT:    sw a1, 0(a0)
 ; ZVFH32-NEXT:    sh a4, 4(a0)
 ; ZVFH32-NEXT:    ret
@@ -147,19 +147,19 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
 ; ZVFH64:       # %bb.0:
 ; ZVFH64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH64-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; ZVFH64-NEXT:    vmv.x.s a1, v9
-; ZVFH64-NEXT:    lui a2, 8
-; ZVFH64-NEXT:    addiw a2, a2, -1
-; ZVFH64-NEXT:    and a1, a1, a2
+; ZVFH64-NEXT:    lui a1, 8
+; ZVFH64-NEXT:    vmv.x.s a2, v9
+; ZVFH64-NEXT:    addiw a1, a1, -1
 ; ZVFH64-NEXT:    vslidedown.vi v8, v9, 1
+; ZVFH64-NEXT:    vslidedown.vi v9, v9, 2
+; ZVFH64-NEXT:    and a2, a2, a1
 ; ZVFH64-NEXT:    vmv.x.s a3, v8
-; ZVFH64-NEXT:    and a2, a3, a2
-; ZVFH64-NEXT:    slli a2, a2, 15
-; ZVFH64-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFH64-NEXT:    vmv.x.s a3, v8
+; ZVFH64-NEXT:    and a1, a3, a1
+; ZVFH64-NEXT:    vmv.x.s a3, v9
 ; ZVFH64-NEXT:    slli a3, a3, 30
-; ZVFH64-NEXT:    or a1, a1, a3
-; ZVFH64-NEXT:    or a1, a1, a2
+; ZVFH64-NEXT:    slli a1, a1, 15
+; ZVFH64-NEXT:    or a2, a2, a3
+; ZVFH64-NEXT:    or a1, a2, a1
 ; ZVFH64-NEXT:    slli a2, a1, 19
 ; ZVFH64-NEXT:    srli a2, a2, 51
 ; ZVFH64-NEXT:    sw a1, 0(a0)
@@ -170,21 +170,21 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
 ; ZVFHMIN32:       # %bb.0:
 ; ZVFHMIN32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN32-NEXT:    vfncvt.rtz.x.f.w v9, v8
+; ZVFHMIN32-NEXT:    lui a1, 8
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
 ; ZVFHMIN32-NEXT:    vmv.x.s a2, v9
-; ZVFHMIN32-NEXT:    lui a3, 8
-; ZVFHMIN32-NEXT:    addi a3, a3, -1
-; ZVFHMIN32-NEXT:    and a2, a2, a3
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v9, 1
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    and a3, a4, a3
-; ZVFHMIN32-NEXT:    slli a4, a1, 17
+; ZVFHMIN32-NEXT:    addi a1, a1, -1
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v9, 1
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    and a2, a2, a1
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v9
+; ZVFHMIN32-NEXT:    and a1, a4, a1
+; ZVFHMIN32-NEXT:    slli a4, a3, 17
+; ZVFHMIN32-NEXT:    slli a3, a3, 30
 ; ZVFHMIN32-NEXT:    srli a4, a4, 19
-; ZVFHMIN32-NEXT:    slli a3, a3, 15
-; ZVFHMIN32-NEXT:    slli a1, a1, 30
+; ZVFHMIN32-NEXT:    slli a1, a1, 15
+; ZVFHMIN32-NEXT:    or a2, a2, a3
 ; ZVFHMIN32-NEXT:    or a1, a2, a1
-; ZVFHMIN32-NEXT:    or a1, a1, a3
 ; ZVFHMIN32-NEXT:    sw a1, 0(a0)
 ; ZVFHMIN32-NEXT:    sh a4, 4(a0)
 ; ZVFHMIN32-NEXT:    ret
@@ -193,19 +193,19 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
 ; ZVFHMIN64:       # %bb.0:
 ; ZVFHMIN64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN64-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v9
-; ZVFHMIN64-NEXT:    lui a2, 8
-; ZVFHMIN64-NEXT:    addiw a2, a2, -1
-; ZVFHMIN64-NEXT:    and a1, a1, a2
+; ZVFHMIN64-NEXT:    lui a1, 8
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v9
+; ZVFHMIN64-NEXT:    addiw a1, a1, -1
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v9, 1
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v9, 2
+; ZVFHMIN64-NEXT:    and a2, a2, a1
 ; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    and a2, a3, a2
-; ZVFHMIN64-NEXT:    slli a2, a2, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    and a1, a3, a1
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v9
 ; ZVFHMIN64-NEXT:    slli a3, a3, 30
-; ZVFHMIN64-NEXT:    or a1, a1, a3
-; ZVFHMIN64-NEXT:    or a1, a1, a2
+; ZVFHMIN64-NEXT:    slli a1, a1, 15
+; ZVFHMIN64-NEXT:    or a2, a2, a3
+; ZVFHMIN64-NEXT:    or a1, a2, a1
 ; ZVFHMIN64-NEXT:    slli a2, a1, 19
 ; ZVFHMIN64-NEXT:    srli a2, a2, 51
 ; ZVFHMIN64-NEXT:    sw a1, 0(a0)
@@ -221,21 +221,21 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
 ; ZVFH32:       # %bb.0:
 ; ZVFH32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH32-NEXT:    vfncvt.rtz.x.f.w v9, v8
+; ZVFH32-NEXT:    lui a1, 16
 ; ZVFH32-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFH32-NEXT:    vmv.x.s a1, v8
 ; ZVFH32-NEXT:    vmv.x.s a2, v9
-; ZVFH32-NEXT:    lui a3, 16
-; ZVFH32-NEXT:    addi a3, a3, -1
-; ZVFH32-NEXT:    and a2, a2, a3
-; ZVFH32-NEXT:    vslidedown.vi v8, v9, 1
-; ZVFH32-NEXT:    vmv.x.s a4, v8
-; ZVFH32-NEXT:    and a3, a4, a3
-; ZVFH32-NEXT:    slli a4, a1, 17
+; ZVFH32-NEXT:    addi a1, a1, -1
+; ZVFH32-NEXT:    vslidedown.vi v9, v9, 1
+; ZVFH32-NEXT:    vmv.x.s a3, v8
+; ZVFH32-NEXT:    and a2, a2, a1
+; ZVFH32-NEXT:    vmv.x.s a4, v9
+; ZVFH32-NEXT:    and a1, a4, a1
+; ZVFH32-NEXT:    slli a4, a3, 17
+; ZVFH32-NEXT:    slli a3, a3, 30
 ; ZVFH32-NEXT:    srli a4, a4, 19
-; ZVFH32-NEXT:    slli a3, a3, 15
-; ZVFH32-NEXT:    slli a1, a1, 30
+; ZVFH32-NEXT:    slli a1, a1, 15
+; ZVFH32-NEXT:    or a2, a2, a3
 ; ZVFH32-NEXT:    or a1, a2, a1
-; ZVFH32-NEXT:    or a1, a1, a3
 ; ZVFH32-NEXT:    sw a1, 0(a0)
 ; ZVFH32-NEXT:    sh a4, 4(a0)
 ; ZVFH32-NEXT:    ret
@@ -244,19 +244,19 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
 ; ZVFH64:       # %bb.0:
 ; ZVFH64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH64-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; ZVFH64-NEXT:    vmv.x.s a1, v9
-; ZVFH64-NEXT:    lui a2, 16
-; ZVFH64-NEXT:    addiw a2, a2, -1
-; ZVFH64-NEXT:    and a1, a1, a2
+; ZVFH64-NEXT:    lui a1, 16
+; ZVFH64-NEXT:    vmv.x.s a2, v9
+; ZVFH64-NEXT:    addiw a1, a1, -1
 ; ZVFH64-NEXT:    vslidedown.vi v8, v9, 1
+; ZVFH64-NEXT:    vslidedown.vi v9, v9, 2
+; ZVFH64-NEXT:    and a2, a2, a1
 ; ZVFH64-NEXT:    vmv.x.s a3, v8
-; ZVFH64-NEXT:    and a2, a3, a2
-; ZVFH64-NEXT:    slli a2, a2, 15
-; ZVFH64-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFH64-NEXT:    vmv.x.s a3, v8
+; ZVFH64-NEXT:    and a1, a3, a1
+; ZVFH64-NEXT:    vmv.x.s a3, v9
 ; ZVFH64-NEXT:    slli a3, a3, 30
-; ZVFH64-NEXT:    or a1, a1, a3
-; ZVFH64-NEXT:    or a1, a1, a2
+; ZVFH64-NEXT:    slli a1, a1, 15
+; ZVFH64-NEXT:    or a2, a2, a3
+; ZVFH64-NEXT:    or a1, a2, a1
 ; ZVFH64-NEXT:    slli a2, a1, 19
 ; ZVFH64-NEXT:    srli a2, a2, 51
 ; ZVFH64-NEXT:    sw a1, 0(a0)
@@ -267,21 +267,21 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
 ; ZVFHMIN32:       # %bb.0:
 ; ZVFHMIN32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN32-NEXT:    vfncvt.rtz.x.f.w v9, v8
+; ZVFHMIN32-NEXT:    lui a1, 16
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
 ; ZVFHMIN32-NEXT:    vmv.x.s a2, v9
-; ZVFHMIN32-NEXT:    lui a3, 16
-; ZVFHMIN32-NEXT:    addi a3, a3, -1
-; ZVFHMIN32-NEXT:    and a2, a2, a3
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v9, 1
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    and a3, a4, a3
-; ZVFHMIN32-NEXT:    slli a4, a1, 17
+; ZVFHMIN32-NEXT:    addi a1, a1, -1
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v9, 1
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    and a2, a2, a1
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v9
+; ZVFHMIN32-NEXT:    and a1, a4, a1
+; ZVFHMIN32-NEXT:    slli a4, a3, 17
+; ZVFHMIN32-NEXT:    slli a3, a3, 30
 ; ZVFHMIN32-NEXT:    srli a4, a4, 19
-; ZVFHMIN32-NEXT:    slli a3, a3, 15
-; ZVFHMIN32-NEXT:    slli a1, a1, 30
+; ZVFHMIN32-NEXT:    slli a1, a1, 15
+; ZVFHMIN32-NEXT:    or a2, a2, a3
 ; ZVFHMIN32-NEXT:    or a1, a2, a1
-; ZVFHMIN32-NEXT:    or a1, a1, a3
 ; ZVFHMIN32-NEXT:    sw a1, 0(a0)
 ; ZVFHMIN32-NEXT:    sh a4, 4(a0)
 ; ZVFHMIN32-NEXT:    ret
@@ -290,19 +290,19 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
 ; ZVFHMIN64:       # %bb.0:
 ; ZVFHMIN64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN64-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v9
-; ZVFHMIN64-NEXT:    lui a2, 16
-; ZVFHMIN64-NEXT:    addiw a2, a2, -1
-; ZVFHMIN64-NEXT:    and a1, a1, a2
+; ZVFHMIN64-NEXT:    lui a1, 16
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v9
+; ZVFHMIN64-NEXT:    addiw a1, a1, -1
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v9, 1
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v9, 2
+; ZVFHMIN64-NEXT:    and a2, a2, a1
 ; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    and a2, a3, a2
-; ZVFHMIN64-NEXT:    slli a2, a2, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    and a1, a3, a1
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v9
 ; ZVFHMIN64-NEXT:    slli a3, a3, 30
-; ZVFHMIN64-NEXT:    or a1, a1, a3
-; ZVFHMIN64-NEXT:    or a1, a1, a2
+; ZVFHMIN64-NEXT:    slli a1, a1, 15
+; ZVFHMIN64-NEXT:    or a2, a2, a3
+; ZVFHMIN64-NEXT:    or a1, a2, a1
 ; ZVFHMIN64-NEXT:    slli a2, a1, 19
 ; ZVFHMIN64-NEXT:    srli a2, a2, 51
 ; ZVFHMIN64-NEXT:    sw a1, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
index e64c7c87132eee..abe145bf6ea287 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll
@@ -112,11 +112,11 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfncvt.f.f.w v24, v16, v0.t
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v24, 16
 ; CHECK-NEXT:    ret
   %v = call <32 x float> @llvm.vp.fptrunc.v32f64.v32f32(<32 x double> %a, <32 x i1> %m, i32 %vl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
index 131fa53b359998..be32c033fe3738 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
@@ -125,10 +125,10 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: round_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -150,10 +150,10 @@ define <1 x float> @round_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -173,10 +173,10 @@ define <2 x float> @round_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -196,10 +196,10 @@ define <4 x float> @round_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -219,10 +219,10 @@ define <8 x float> @round_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -242,10 +242,10 @@ define <16 x float> @round_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
index b21be367f8ef5e..774ce5c7859c9d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
@@ -30,9 +30,9 @@ define <1 x half> @round_v1f16(<1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -69,9 +69,9 @@ define <2 x half> @round_v2f16(<2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -108,9 +108,9 @@ define <4 x half> @round_v4f16(<4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -147,9 +147,9 @@ define <8 x half> @round_v8f16(<8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -186,9 +186,9 @@ define <16 x half> @round_v16f16(<16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -225,12 +225,12 @@ define <32 x half> @round_v32f16(<32 x half> %x) {
 ; ZVFHMIN-LABEL: round_v32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a0, 32
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
index 37f2e59ad7516d..5c0279e133dfaa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
@@ -125,10 +125,10 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -150,10 +150,10 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -173,10 +173,10 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -196,10 +196,10 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -219,10 +219,10 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -242,10 +242,10 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
index 13d62bb24441c9..0b6baad127643a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
@@ -30,9 +30,9 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -69,9 +69,9 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -108,9 +108,9 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -147,9 +147,9 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -186,9 +186,9 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -225,12 +225,12 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) {
 ; ZVFHMIN-LABEL: roundeven_v32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a0, 32
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
index b911722368ce3a..2173887e854178 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
@@ -113,10 +113,10 @@ define <32 x half> @trunc_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -136,10 +136,10 @@ define <1 x float> @trunc_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -157,10 +157,10 @@ define <2 x float> @trunc_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -178,10 +178,10 @@ define <4 x float> @trunc_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -199,10 +199,10 @@ define <8 x float> @trunc_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
@@ -220,10 +220,10 @@ define <16 x float> @trunc_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v12, v8, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
index f7737784d4ca57..986636d974acaa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
@@ -19,8 +19,8 @@ define <1 x i1> @insertelt_idx_v1i1(<1 x i1> %x, i1 %elt, i32 zeroext %idx) noun
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
@@ -52,8 +52,8 @@ define <2 x i1> @insertelt_idx_v2i1(<2 x i1> %x, i1 %elt, i32 zeroext %idx) noun
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
@@ -87,8 +87,8 @@ define <8 x i1> @insertelt_idx_v8i1(<8 x i1> %x, i1 %elt, i32 zeroext %idx) noun
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index d31579e45683b7..b3860795a557b5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -755,15 +755,15 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV32VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32VLA-NEXT:    vle64.v v8, (a0)
 ; RV32VLA-NEXT:    addi a0, sp, 128
+; RV32VLA-NEXT:    csrr a2, vlenb
+; RV32VLA-NEXT:    addi a3, sp, 64
+; RV32VLA-NEXT:    slli a2, a2, 3
 ; RV32VLA-NEXT:    vse64.v v8, (a0)
-; RV32VLA-NEXT:    csrr a0, vlenb
-; RV32VLA-NEXT:    slli a0, a0, 3
-; RV32VLA-NEXT:    addi a2, sp, 64
-; RV32VLA-NEXT:    add a3, a2, a0
-; RV32VLA-NEXT:    vl8re64.v v8, (a3)
-; RV32VLA-NEXT:    vl8re64.v v16, (a2)
-; RV32VLA-NEXT:    add a0, a1, a0
-; RV32VLA-NEXT:    vs8r.v v8, (a0)
+; RV32VLA-NEXT:    add a0, a3, a2
+; RV32VLA-NEXT:    vl8re64.v v8, (a0)
+; RV32VLA-NEXT:    vl8re64.v v16, (a3)
+; RV32VLA-NEXT:    add a2, a1, a2
+; RV32VLA-NEXT:    vs8r.v v8, (a2)
 ; RV32VLA-NEXT:    vs8r.v v16, (a1)
 ; RV32VLA-NEXT:    addi sp, s0, -80
 ; RV32VLA-NEXT:    .cfi_def_cfa sp, 80
@@ -792,15 +792,15 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV64VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64VLA-NEXT:    vle64.v v8, (a0)
 ; RV64VLA-NEXT:    addi a0, sp, 128
+; RV64VLA-NEXT:    csrr a2, vlenb
+; RV64VLA-NEXT:    addi a3, sp, 64
+; RV64VLA-NEXT:    slli a2, a2, 3
 ; RV64VLA-NEXT:    vse64.v v8, (a0)
-; RV64VLA-NEXT:    csrr a0, vlenb
-; RV64VLA-NEXT:    slli a0, a0, 3
-; RV64VLA-NEXT:    addi a2, sp, 64
-; RV64VLA-NEXT:    add a3, a2, a0
-; RV64VLA-NEXT:    vl8re64.v v8, (a3)
-; RV64VLA-NEXT:    vl8re64.v v16, (a2)
-; RV64VLA-NEXT:    add a0, a1, a0
-; RV64VLA-NEXT:    vs8r.v v8, (a0)
+; RV64VLA-NEXT:    add a0, a3, a2
+; RV64VLA-NEXT:    vl8re64.v v8, (a0)
+; RV64VLA-NEXT:    vl8re64.v v16, (a3)
+; RV64VLA-NEXT:    add a2, a1, a2
+; RV64VLA-NEXT:    vs8r.v v8, (a2)
 ; RV64VLA-NEXT:    vs8r.v v16, (a1)
 ; RV64VLA-NEXT:    addi sp, s0, -80
 ; RV64VLA-NEXT:    .cfi_def_cfa sp, 80
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index 0ff3641483ddbc..6782b2003ba94b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -122,14 +122,14 @@ define <64 x i32> @insertelt_v64i32_idx(<64 x i32> %a, i32 %y, i32 zeroext %idx)
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    andi a1, a1, 63
-; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    mv a2, sp
-; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    addi a3, sp, 128
 ; RV32-NEXT:    li a4, 32
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; RV32-NEXT:    vse32.v v16, (a3)
 ; RV32-NEXT:    vse32.v v8, (a2)
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    sw a0, 0(a1)
 ; RV32-NEXT:    vle32.v v8, (a2)
 ; RV32-NEXT:    vle32.v v16, (a3)
@@ -155,14 +155,14 @@ define <64 x i32> @insertelt_v64i32_idx(<64 x i32> %a, i32 %y, i32 zeroext %idx)
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    andi a1, a1, 63
-; RV64-NEXT:    slli a1, a1, 2
 ; RV64-NEXT:    mv a2, sp
-; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    addi a3, sp, 128
 ; RV64-NEXT:    li a4, 32
+; RV64-NEXT:    slli a1, a1, 2
 ; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; RV64-NEXT:    vse32.v v16, (a3)
 ; RV64-NEXT:    vse32.v v8, (a2)
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    sw a0, 0(a1)
 ; RV64-NEXT:    vle32.v v8, (a2)
 ; RV64-NEXT:    vle32.v v16, (a3)
@@ -228,17 +228,17 @@ define <3 x i64> @insertelt_v3i64(<3 x i64> %a, i64 %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 3
-; RV32-NEXT:    vmv.x.s a2, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:    vmv.x.s a3, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vmv.x.s a4, v9
+; RV32-NEXT:    vmv.x.s a4, v10
 ; RV32-NEXT:    vmv.x.s a5, v8
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a5
+; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32-NEXT:    vslide1down.vx v8, v8, a3
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslidedown.vi v8, v8, 2
@@ -248,11 +248,11 @@ define <3 x i64> @insertelt_v3i64(<3 x i64> %a, i64 %y) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v9
-; RV64-NEXT:    vmv.x.s a2, v8
+; RV64-NEXT:    vmv.x.s a1, v8
+; RV64-NEXT:    vmv.x.s a2, v9
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-NEXT:    ret
@@ -313,9 +313,9 @@ define <32 x i16> @insertelt_v32i16(<32 x i16> %a, i16 %y, i32 %idx) {
 ; RV64-LABEL: insertelt_v32i16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a2, 32
+; RV64-NEXT:    slli a1, a1, 32
 ; RV64-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
 ; RV64-NEXT:    vmv.s.x v12, a0
-; RV64-NEXT:    slli a1, a1, 32
 ; RV64-NEXT:    srli a1, a1, 32
 ; RV64-NEXT:    addi a0, a1, 1
 ; RV64-NEXT:    vsetvli zero, a0, e16, m4, tu, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index aaed16097d15c2..0b4ced10b9cc46 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -697,12 +697,11 @@ define void @buildvec_seq_v9i8(ptr %x) {
 ; CHECK-LABEL: buildvec_seq_v9i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 73
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 3
-; CHECK-NEXT:    li a1, 146
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    li a1, 146
 ; CHECK-NEXT:    vmv.s.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
@@ -728,49 +727,27 @@ define void @buildvec_seq_v4i16_v2i32(ptr %x) {
 }
 
 define void @buildvec_vid_step1o2_v4i32(ptr %z0, ptr %z1, ptr %z2, ptr %z3, ptr %z4, ptr %z5, ptr %z6) {
-; RV32-LABEL: buildvec_vid_step1o2_v4i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vid.v v8
-; RV32-NEXT:    vsrl.vi v8, v8, 1
-; RV32-NEXT:    vse32.v v8, (a0)
-; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    vmv.v.i v9, 1
-; RV32-NEXT:    vse32.v v8, (a2)
-; RV32-NEXT:    vse32.v v8, (a3)
-; RV32-NEXT:    vse32.v v8, (a4)
-; RV32-NEXT:    vmv.s.x v8, zero
-; RV32-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; RV32-NEXT:    vslideup.vi v9, v8, 1
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vse32.v v9, (a5)
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    vse32.v v8, (a6)
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: buildvec_vid_step1o2_v4i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    vsrl.vi v8, v8, 1
-; RV64-NEXT:    vse32.v v8, (a0)
-; RV64-NEXT:    vmv.v.i v9, 1
-; RV64-NEXT:    vse32.v v8, (a1)
-; RV64-NEXT:    vse32.v v8, (a2)
-; RV64-NEXT:    vse32.v v8, (a3)
-; RV64-NEXT:    vse32.v v8, (a4)
-; RV64-NEXT:    vmv.s.x v8, zero
-; RV64-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; RV64-NEXT:    vslideup.vi v9, v8, 1
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vse32.v v9, (a5)
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    vse32.v v8, (a6)
-; RV64-NEXT:    ret
+; CHECK-LABEL: buildvec_vid_step1o2_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 1
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vsrl.vi v9, v9, 1
+; CHECK-NEXT:    vse32.v v9, (a0)
+; CHECK-NEXT:    vse32.v v9, (a1)
+; CHECK-NEXT:    vse32.v v9, (a2)
+; CHECK-NEXT:    vse32.v v9, (a3)
+; CHECK-NEXT:    vse32.v v9, (a4)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vse32.v v8, (a5)
+; CHECK-NEXT:    vse32.v v9, (a6)
+; CHECK-NEXT:    ret
   store <4 x i32> <i32 0, i32 0, i32 1, i32 1>, ptr %z0
   store <4 x i32> <i32 0, i32 0, i32 1, i32 undef>, ptr %z1
   store <4 x i32> <i32 0, i32 undef, i32 1, i32 1>, ptr %z2
@@ -788,22 +765,22 @@ define void @buildvec_vid_step1o2_add3_v4i16(ptr %z0, ptr %z1, ptr %z2, ptr %z3,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vmv.v.i v9, 3
 ; CHECK-NEXT:    vsrl.vi v8, v8, 1
 ; CHECK-NEXT:    vadd.vi v8, v8, 3
 ; CHECK-NEXT:    vse16.v v8, (a0)
-; CHECK-NEXT:    vmv.v.i v9, 3
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    vse16.v v8, (a3)
 ; CHECK-NEXT:    vse16.v v8, (a4)
 ; CHECK-NEXT:    vmv.v.i v8, 4
+; CHECK-NEXT:    li a0, 4
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
 ; CHECK-NEXT:    vse16.v v8, (a5)
-; CHECK-NEXT:    li a0, 4
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0
-; CHECK-NEXT:    vse16.v v8, (a6)
+; CHECK-NEXT:    vse16.v v9, (a6)
 ; CHECK-NEXT:    ret
   store <4 x i16> <i16 3, i16 3, i16 4, i16 4>, ptr %z0
   store <4 x i16> <i16 3, i16 3, i16 4, i16 undef>, ptr %z1
@@ -997,21 +974,19 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
 ; RV32-NEXT:    li a0, 512
-; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v12, 3
 ; RV32-NEXT:    li a1, 240
-; RV32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; RV32-NEXT:    vmv.s.x v8, a1
+; RV32-NEXT:    li a1, 15
+; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT:    vmv.v.i v12, 3
+; RV32-NEXT:    slli a1, a1, 8
 ; RV32-NEXT:    vmv1r.v v0, v10
-; RV32-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
 ; RV32-NEXT:    vmerge.vim v12, v12, 0, v0
 ; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
 ; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV32-NEXT:    li a1, 15
-; RV32-NEXT:    slli a1, a1, 8
 ; RV32-NEXT:    vmv.s.x v8, a1
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -1029,19 +1004,17 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.i v0, 3
 ; RV64V-NEXT:    vmv.v.i v9, 0
-; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
 ; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmv.v.i v8, 12
+; RV64V-NEXT:    li a1, 48
+; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmv.v.i v12, 3
-; RV64V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV64V-NEXT:    vmv.v.i v8, 12
 ; RV64V-NEXT:    vmv1r.v v0, v10
-; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmerge.vim v12, v12, 0, v0
 ; RV64V-NEXT:    vmv1r.v v0, v8
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV64V-NEXT:    li a1, 48
 ; RV64V-NEXT:    vmv.s.x v8, a1
 ; RV64V-NEXT:    vmv.v.v v0, v10
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -1059,21 +1032,19 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32-NEXT:    vmv.v.i v9, 0
-; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
 ; RV64ZVE32-NEXT:    li a0, 512
-; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64ZVE32-NEXT:    vmv.v.i v12, 3
 ; RV64ZVE32-NEXT:    li a1, 240
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; RV64ZVE32-NEXT:    vmv.s.x v8, a1
+; RV64ZVE32-NEXT:    li a1, 15
+; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT:    vmv.v.i v12, 3
+; RV64ZVE32-NEXT:    slli a1, a1, 8
 ; RV64ZVE32-NEXT:    vmv1r.v v0, v10
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
 ; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 0, v0
 ; RV64ZVE32-NEXT:    vmv1r.v v0, v8
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV64ZVE32-NEXT:    li a1, 15
-; RV64ZVE32-NEXT:    slli a1, a1, 8
 ; RV64ZVE32-NEXT:    vmv.s.x v8, a1
 ; RV64ZVE32-NEXT:    vmv.v.v v0, v10
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -1138,22 +1109,22 @@ define <4 x i64> @v4xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2)
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v9, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v10, v9, a2
 ; RV32-NEXT:    vslide1down.vx v9, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v10, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: v4xi64_exact:
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.x v8, a2
+; RV64V-NEXT:    vmv.v.x v10, a0
 ; RV64V-NEXT:    vslide1down.vx v9, v8, a3
-; RV64V-NEXT:    vmv.v.x v8, a0
-; RV64V-NEXT:    vslide1down.vx v8, v8, a1
+; RV64V-NEXT:    vslide1down.vx v8, v10, a1
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32-LABEL: v4xi64_exact:
@@ -1187,20 +1158,20 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i
 ; RV32-NEXT:    lw s0, 16(sp)
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v9, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v10, v9, a2
 ; RV32-NEXT:    vslide1down.vx v9, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v10, a3
 ; RV32-NEXT:    vmv.v.x v10, s0
-; RV32-NEXT:    vslide1down.vx v10, v10, t6
-; RV32-NEXT:    vslide1down.vx v10, v10, t5
-; RV32-NEXT:    vslide1down.vx v10, v10, t4
 ; RV32-NEXT:    vmv.v.x v11, t3
+; RV32-NEXT:    vslide1down.vx v10, v10, t6
 ; RV32-NEXT:    vslide1down.vx v11, v11, t2
+; RV32-NEXT:    vslide1down.vx v10, v10, t5
 ; RV32-NEXT:    vslide1down.vx v11, v11, t1
+; RV32-NEXT:    vslide1down.vx v10, v10, t4
 ; RV32-NEXT:    vslide1down.vx v11, v11, t0
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
@@ -1212,13 +1183,13 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.x v8, a2
+; RV64V-NEXT:    vmv.v.x v10, a0
+; RV64V-NEXT:    vmv.v.x v11, a4
+; RV64V-NEXT:    vmv.v.x v12, a6
 ; RV64V-NEXT:    vslide1down.vx v9, v8, a3
-; RV64V-NEXT:    vmv.v.x v8, a0
-; RV64V-NEXT:    vslide1down.vx v8, v8, a1
-; RV64V-NEXT:    vmv.v.x v10, a4
-; RV64V-NEXT:    vslide1down.vx v10, v10, a5
-; RV64V-NEXT:    vmv.v.x v11, a6
-; RV64V-NEXT:    vslide1down.vx v11, v11, a7
+; RV64V-NEXT:    vslide1down.vx v8, v10, a1
+; RV64V-NEXT:    vslide1down.vx v10, v11, a5
+; RV64V-NEXT:    vslide1down.vx v11, v12, a7
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32-LABEL: v8xi64_exact:
@@ -1249,13 +1220,13 @@ define <8 x i64> @v8xi64_exact_equal_halves(i64 %a, i64 %b, i64 %c, i64 %d) vsca
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v9, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v10, v9, a2
 ; RV32-NEXT:    vslide1down.vx v9, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v10, a3
 ; RV32-NEXT:    vmv.v.v v10, v8
 ; RV32-NEXT:    vmv.v.v v11, v9
 ; RV32-NEXT:    ret
@@ -1264,9 +1235,9 @@ define <8 x i64> @v8xi64_exact_equal_halves(i64 %a, i64 %b, i64 %c, i64 %d) vsca
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.x v8, a2
+; RV64V-NEXT:    vmv.v.x v10, a0
 ; RV64V-NEXT:    vslide1down.vx v9, v8, a3
-; RV64V-NEXT:    vmv.v.x v8, a0
-; RV64V-NEXT:    vslide1down.vx v8, v8, a1
+; RV64V-NEXT:    vslide1down.vx v8, v10, a1
 ; RV64V-NEXT:    vmv.v.v v10, v8
 ; RV64V-NEXT:    vmv.v.v v11, v9
 ; RV64V-NEXT:    ret
@@ -1298,22 +1269,22 @@ define <8 x i64> @v8xi64_exact_undef_suffix(i64 %a, i64 %b, i64 %c, i64 %d) vsca
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v9, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v10, v9, a2
 ; RV32-NEXT:    vslide1down.vx v9, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v10, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: v8xi64_exact_undef_suffix:
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.x v8, a2
+; RV64V-NEXT:    vmv.v.x v10, a0
 ; RV64V-NEXT:    vslide1down.vx v9, v8, a3
-; RV64V-NEXT:    vmv.v.x v8, a0
-; RV64V-NEXT:    vslide1down.vx v8, v8, a1
+; RV64V-NEXT:    vslide1down.vx v8, v10, a1
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32-LABEL: v8xi64_exact_undef_suffix:
@@ -1335,22 +1306,22 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v9, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v9, v9, a2
 ; RV32-NEXT:    vslide1down.vx v11, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v10, v8, a3
+; RV32-NEXT:    vslide1down.vx v10, v9, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: v8xi64_exact_undef_prefix:
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.x v8, a2
+; RV64V-NEXT:    vmv.v.x v9, a0
 ; RV64V-NEXT:    vslide1down.vx v11, v8, a3
-; RV64V-NEXT:    vmv.v.x v8, a0
-; RV64V-NEXT:    vslide1down.vx v10, v8, a1
+; RV64V-NEXT:    vslide1down.vx v10, v9, a1
 ; RV64V-NEXT:    ret
 ;
 ; RV64ZVE32-LABEL: v8xi64_exact_undef_prefix:
@@ -1387,32 +1358,31 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu t2, 9(a0)
 ; RV32-ONLY-NEXT:    lbu t3, 10(a0)
 ; RV32-ONLY-NEXT:    lbu t4, 11(a0)
+; RV32-ONLY-NEXT:    li t5, 255
+; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV32-ONLY-NEXT:    lbu t5, 12(a0)
 ; RV32-ONLY-NEXT:    lbu t6, 13(a0)
 ; RV32-ONLY-NEXT:    lbu s0, 14(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 15(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a1
+; RV32-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t2
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t3
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t4
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t5
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, t0
-; RV32-ONLY-NEXT:    vmv.v.x v8, t1
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t2
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t3
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t4
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t5
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t6
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, s0
-; RV32-ONLY-NEXT:    li a1, 255
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, a1
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, s0
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, t0
+; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-ONLY-NEXT:    .cfi_restore s0
 ; RV32-ONLY-NEXT:    addi sp, sp, 16
@@ -1421,45 +1391,45 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ;
 ; RV32VB-LABEL: buildvec_v16i8_loads_contigous:
 ; RV32VB:       # %bb.0:
-; RV32VB-NEXT:    lbu a1, 1(a0)
-; RV32VB-NEXT:    lbu a2, 0(a0)
+; RV32VB-NEXT:    lbu a1, 0(a0)
+; RV32VB-NEXT:    lbu a2, 1(a0)
 ; RV32VB-NEXT:    lbu a3, 2(a0)
 ; RV32VB-NEXT:    lbu a4, 3(a0)
-; RV32VB-NEXT:    slli a1, a1, 8
-; RV32VB-NEXT:    or a1, a2, a1
+; RV32VB-NEXT:    lbu a5, 4(a0)
+; RV32VB-NEXT:    lbu a6, 5(a0)
+; RV32VB-NEXT:    lbu a7, 6(a0)
+; RV32VB-NEXT:    lbu t0, 7(a0)
+; RV32VB-NEXT:    slli a2, a2, 8
 ; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a4, a4, 24
+; RV32VB-NEXT:    slli a6, a6, 8
+; RV32VB-NEXT:    or a1, a1, a2
 ; RV32VB-NEXT:    or a3, a4, a3
-; RV32VB-NEXT:    lbu a2, 4(a0)
-; RV32VB-NEXT:    lbu a4, 5(a0)
-; RV32VB-NEXT:    or a1, a1, a3
-; RV32VB-NEXT:    lbu a3, 6(a0)
-; RV32VB-NEXT:    lbu a5, 7(a0)
-; RV32VB-NEXT:    slli a4, a4, 8
-; RV32VB-NEXT:    or a2, a2, a4
-; RV32VB-NEXT:    slli a3, a3, 16
-; RV32VB-NEXT:    slli a5, a5, 24
-; RV32VB-NEXT:    or a3, a5, a3
+; RV32VB-NEXT:    or a2, a5, a6
 ; RV32VB-NEXT:    lbu a4, 8(a0)
 ; RV32VB-NEXT:    lbu a5, 9(a0)
-; RV32VB-NEXT:    or a2, a2, a3
-; RV32VB-NEXT:    lbu a3, 10(a0)
-; RV32VB-NEXT:    lbu a6, 11(a0)
+; RV32VB-NEXT:    lbu a6, 10(a0)
+; RV32VB-NEXT:    lbu t1, 11(a0)
+; RV32VB-NEXT:    slli a7, a7, 16
+; RV32VB-NEXT:    slli t0, t0, 24
 ; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    slli a6, a6, 16
+; RV32VB-NEXT:    slli t1, t1, 24
+; RV32VB-NEXT:    or a7, t0, a7
 ; RV32VB-NEXT:    or a4, a4, a5
-; RV32VB-NEXT:    slli a3, a3, 16
-; RV32VB-NEXT:    slli a6, a6, 24
-; RV32VB-NEXT:    or a3, a6, a3
 ; RV32VB-NEXT:    lbu a5, 12(a0)
-; RV32VB-NEXT:    lbu a6, 13(a0)
-; RV32VB-NEXT:    or a3, a4, a3
-; RV32VB-NEXT:    lbu a4, 14(a0)
+; RV32VB-NEXT:    lbu t0, 13(a0)
+; RV32VB-NEXT:    or a6, t1, a6
+; RV32VB-NEXT:    lbu t1, 14(a0)
 ; RV32VB-NEXT:    lbu a0, 15(a0)
-; RV32VB-NEXT:    slli a6, a6, 8
-; RV32VB-NEXT:    or a5, a5, a6
-; RV32VB-NEXT:    slli a4, a4, 16
+; RV32VB-NEXT:    slli t0, t0, 8
+; RV32VB-NEXT:    or a5, a5, t0
+; RV32VB-NEXT:    slli t1, t1, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
-; RV32VB-NEXT:    or a0, a0, a4
+; RV32VB-NEXT:    or a0, a0, t1
+; RV32VB-NEXT:    or a1, a1, a3
+; RV32VB-NEXT:    or a2, a2, a7
+; RV32VB-NEXT:    or a3, a4, a6
 ; RV32VB-NEXT:    or a0, a5, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
@@ -1474,34 +1444,34 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 2(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 3(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 4(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 5(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 6(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 7(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    lbu a2, 4(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 5(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 6(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 7(a0)
+; RV32VB-PACK-NEXT:    lbu a2, 8(a0)
+; RV32VB-PACK-NEXT:    lbu t1, 9(a0)
+; RV32VB-PACK-NEXT:    lbu t2, 10(a0)
+; RV32VB-PACK-NEXT:    lbu t3, 11(a0)
 ; RV32VB-PACK-NEXT:    packh a3, a3, a4
-; RV32VB-PACK-NEXT:    pack a1, a1, a3
-; RV32VB-PACK-NEXT:    packh a2, a2, a5
-; RV32VB-PACK-NEXT:    packh a3, a6, a7
-; RV32VB-PACK-NEXT:    lbu a4, 8(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 9(a0)
-; RV32VB-PACK-NEXT:    pack a2, a2, a3
-; RV32VB-PACK-NEXT:    lbu a3, 10(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 11(a0)
-; RV32VB-PACK-NEXT:    packh a4, a4, a5
-; RV32VB-PACK-NEXT:    lbu a5, 12(a0)
+; RV32VB-PACK-NEXT:    packh a4, a5, a6
+; RV32VB-PACK-NEXT:    packh a5, a7, t0
+; RV32VB-PACK-NEXT:    lbu a6, 12(a0)
 ; RV32VB-PACK-NEXT:    lbu a7, 13(a0)
 ; RV32VB-PACK-NEXT:    lbu t0, 14(a0)
 ; RV32VB-PACK-NEXT:    lbu a0, 15(a0)
-; RV32VB-PACK-NEXT:    packh a3, a3, a6
-; RV32VB-PACK-NEXT:    pack a3, a4, a3
-; RV32VB-PACK-NEXT:    packh a4, a5, a7
+; RV32VB-PACK-NEXT:    packh a2, a2, t1
+; RV32VB-PACK-NEXT:    packh t1, t2, t3
+; RV32VB-PACK-NEXT:    packh a6, a6, a7
 ; RV32VB-PACK-NEXT:    packh a0, t0, a0
-; RV32VB-PACK-NEXT:    pack a0, a4, a0
+; RV32VB-PACK-NEXT:    pack a1, a1, a3
+; RV32VB-PACK-NEXT:    pack a3, a4, a5
+; RV32VB-PACK-NEXT:    pack a2, a2, t1
+; RV32VB-PACK-NEXT:    pack a0, a6, a0
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
 ;
@@ -1523,32 +1493,31 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu t2, 9(a0)
 ; RV64V-ONLY-NEXT:    lbu t3, 10(a0)
 ; RV64V-ONLY-NEXT:    lbu t4, 11(a0)
+; RV64V-ONLY-NEXT:    li t5, 255
+; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV64V-ONLY-NEXT:    lbu t5, 12(a0)
 ; RV64V-ONLY-NEXT:    lbu t6, 13(a0)
 ; RV64V-ONLY-NEXT:    lbu s0, 14(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 15(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a1
+; RV64V-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t2
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t3
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t4
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a5
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t5
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a6
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, t0
-; RV64V-ONLY-NEXT:    vmv.v.x v8, t1
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t2
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t3
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t4
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t5
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t6
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, s0
-; RV64V-ONLY-NEXT:    li a1, 255
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, a1
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, s0
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, t0
+; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64V-ONLY-NEXT:    .cfi_restore s0
 ; RV64V-ONLY-NEXT:    addi sp, sp, 16
@@ -1557,52 +1526,52 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_loads_contigous:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 1(a0)
-; RVA22U64-NEXT:    lbu a2, 0(a0)
+; RVA22U64-NEXT:    lbu a6, 0(a0)
+; RVA22U64-NEXT:    lbu a2, 1(a0)
 ; RVA22U64-NEXT:    lbu a3, 2(a0)
 ; RVA22U64-NEXT:    lbu a4, 3(a0)
-; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    lbu a5, 4(a0)
+; RVA22U64-NEXT:    lbu a1, 5(a0)
+; RVA22U64-NEXT:    lbu a7, 6(a0)
+; RVA22U64-NEXT:    lbu t0, 7(a0)
+; RVA22U64-NEXT:    slli a2, a2, 8
 ; RVA22U64-NEXT:    slli a3, a3, 16
 ; RVA22U64-NEXT:    slli a4, a4, 24
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    or a1, a1, a3
-; RVA22U64-NEXT:    lbu a2, 4(a0)
-; RVA22U64-NEXT:    lbu a3, 5(a0)
-; RVA22U64-NEXT:    lbu a4, 6(a0)
-; RVA22U64-NEXT:    lbu a5, 7(a0)
-; RVA22U64-NEXT:    slli a2, a2, 32
-; RVA22U64-NEXT:    slli a3, a3, 40
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    slli a4, a4, 48
-; RVA22U64-NEXT:    slli a5, a5, 56
-; RVA22U64-NEXT:    or a4, a4, a5
-; RVA22U64-NEXT:    or a2, a2, a4
-; RVA22U64-NEXT:    lbu a3, 8(a0)
-; RVA22U64-NEXT:    lbu a4, 9(a0)
-; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    slli a5, a5, 32
+; RVA22U64-NEXT:    slli a1, a1, 40
+; RVA22U64-NEXT:    or a6, a6, a2
+; RVA22U64-NEXT:    or t2, a4, a3
+; RVA22U64-NEXT:    or t1, a1, a5
+; RVA22U64-NEXT:    lbu a4, 8(a0)
+; RVA22U64-NEXT:    lbu a5, 9(a0)
 ; RVA22U64-NEXT:    lbu a2, 10(a0)
-; RVA22U64-NEXT:    lbu a5, 11(a0)
-; RVA22U64-NEXT:    slli a4, a4, 8
-; RVA22U64-NEXT:    or a3, a3, a4
+; RVA22U64-NEXT:    lbu a1, 11(a0)
+; RVA22U64-NEXT:    slli a7, a7, 48
+; RVA22U64-NEXT:    slli t0, t0, 56
+; RVA22U64-NEXT:    slli a5, a5, 8
 ; RVA22U64-NEXT:    slli a2, a2, 16
-; RVA22U64-NEXT:    slli a5, a5, 24
-; RVA22U64-NEXT:    or a2, a2, a5
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    lbu a3, 12(a0)
-; RVA22U64-NEXT:    lbu a4, 13(a0)
-; RVA22U64-NEXT:    lbu a5, 14(a0)
+; RVA22U64-NEXT:    slli a1, a1, 24
+; RVA22U64-NEXT:    or a7, t0, a7
+; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    lbu a2, 12(a0)
+; RVA22U64-NEXT:    lbu a5, 13(a0)
+; RVA22U64-NEXT:    lbu a3, 14(a0)
 ; RVA22U64-NEXT:    lbu a0, 15(a0)
-; RVA22U64-NEXT:    slli a3, a3, 32
-; RVA22U64-NEXT:    slli a4, a4, 40
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    slli a5, a5, 48
+; RVA22U64-NEXT:    slli a2, a2, 32
+; RVA22U64-NEXT:    slli a5, a5, 40
+; RVA22U64-NEXT:    or a2, a2, a5
+; RVA22U64-NEXT:    slli a3, a3, 48
 ; RVA22U64-NEXT:    slli a0, a0, 56
-; RVA22U64-NEXT:    or a0, a0, a5
 ; RVA22U64-NEXT:    or a0, a0, a3
+; RVA22U64-NEXT:    or a3, a6, t2
+; RVA22U64-NEXT:    or a5, a7, t1
+; RVA22U64-NEXT:    or a1, a1, a4
 ; RVA22U64-NEXT:    or a0, a0, a2
+; RVA22U64-NEXT:    or a3, a3, a5
+; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-NEXT:    vmv.v.x v8, a1
+; RVA22U64-NEXT:    vmv.v.x v8, a3
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-NEXT:    ret
 ;
@@ -1611,35 +1580,35 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
 ; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
 ; RVA22U64-PACK-NEXT:    lbu a6, 2(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 3(a0)
-; RVA22U64-PACK-NEXT:    packh a7, a1, a2
-; RVA22U64-PACK-NEXT:    lbu a2, 4(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 5(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 6(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 7(a0)
-; RVA22U64-PACK-NEXT:    packh a4, a6, a4
-; RVA22U64-PACK-NEXT:    packw a4, a7, a4
-; RVA22U64-PACK-NEXT:    packh a2, a2, a5
-; RVA22U64-PACK-NEXT:    packh a1, a3, a1
-; RVA22U64-PACK-NEXT:    packw a1, a2, a1
-; RVA22U64-PACK-NEXT:    lbu a2, 8(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 9(a0)
-; RVA22U64-PACK-NEXT:    pack a6, a4, a1
-; RVA22U64-PACK-NEXT:    lbu a7, 10(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 11(a0)
-; RVA22U64-PACK-NEXT:    packh a2, a2, a3
-; RVA22U64-PACK-NEXT:    lbu a3, 12(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 13(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 14(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 3(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 4(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 5(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 6(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 7(a0)
+; RVA22U64-PACK-NEXT:    packh t1, a1, a2
+; RVA22U64-PACK-NEXT:    lbu t2, 8(a0)
+; RVA22U64-PACK-NEXT:    lbu t3, 9(a0)
+; RVA22U64-PACK-NEXT:    lbu t4, 10(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 11(a0)
+; RVA22U64-PACK-NEXT:    packh a6, a6, a7
+; RVA22U64-PACK-NEXT:    packh a7, t0, a3
+; RVA22U64-PACK-NEXT:    packh t0, a4, a5
+; RVA22U64-PACK-NEXT:    lbu a5, 12(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 13(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 14(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 15(a0)
-; RVA22U64-PACK-NEXT:    packh a5, a7, a5
-; RVA22U64-PACK-NEXT:    packw a2, a2, a5
-; RVA22U64-PACK-NEXT:    packh a1, a3, a1
-; RVA22U64-PACK-NEXT:    packh a0, a4, a0
-; RVA22U64-PACK-NEXT:    packw a0, a1, a0
-; RVA22U64-PACK-NEXT:    pack a0, a2, a0
+; RVA22U64-PACK-NEXT:    packh a4, t2, t3
+; RVA22U64-PACK-NEXT:    packh a1, t4, a1
+; RVA22U64-PACK-NEXT:    packh a3, a5, a3
+; RVA22U64-PACK-NEXT:    packh a0, a2, a0
+; RVA22U64-PACK-NEXT:    packw a2, t1, a6
+; RVA22U64-PACK-NEXT:    packw a5, a7, t0
+; RVA22U64-PACK-NEXT:    packw a1, a4, a1
+; RVA22U64-PACK-NEXT:    packw a0, a3, a0
+; RVA22U64-PACK-NEXT:    pack a2, a2, a5
+; RVA22U64-PACK-NEXT:    pack a0, a1, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-PACK-NEXT:    vmv.v.x v8, a6
+; RVA22U64-PACK-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -1661,32 +1630,31 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu t2, 9(a0)
 ; RV64ZVE32-NEXT:    lbu t3, 10(a0)
 ; RV64ZVE32-NEXT:    lbu t4, 11(a0)
+; RV64ZVE32-NEXT:    li t5, 255
+; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, t5
 ; RV64ZVE32-NEXT:    lbu t5, 12(a0)
 ; RV64ZVE32-NEXT:    lbu t6, 13(a0)
 ; RV64ZVE32-NEXT:    lbu s0, 14(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 15(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32-NEXT:    vmv.v.x v9, t1
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t2
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t3
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t4
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a5
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t5
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, t0
-; RV64ZVE32-NEXT:    vmv.v.x v8, t1
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t2
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t3
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t4
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t5
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t6
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, s0
-; RV64ZVE32-NEXT:    li a1, 255
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, a1
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, s0
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, t0
+; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64ZVE32-NEXT:    .cfi_restore s0
 ; RV64ZVE32-NEXT:    addi sp, sp, 16
@@ -1764,32 +1732,31 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu t2, 154(a0)
 ; RV32-ONLY-NEXT:    lbu t3, 161(a0)
 ; RV32-ONLY-NEXT:    lbu t4, 163(a0)
+; RV32-ONLY-NEXT:    li t5, 255
+; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV32-ONLY-NEXT:    lbu t5, 93(a0)
 ; RV32-ONLY-NEXT:    lbu t6, 105(a0)
 ; RV32-ONLY-NEXT:    lbu s0, 124(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 144(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a1
+; RV32-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t5
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t3
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, s0
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t4
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a5
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, t0
-; RV32-ONLY-NEXT:    vmv.v.x v8, t1
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t5
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t6
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t3
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, s0
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t4
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-ONLY-NEXT:    li a0, 255
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, a0
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t2
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, t0
+; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, t2
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-ONLY-NEXT:    .cfi_restore s0
 ; RV32-ONLY-NEXT:    addi sp, sp, 16
@@ -1798,50 +1765,50 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ;
 ; RV32VB-LABEL: buildvec_v16i8_loads_gather:
 ; RV32VB:       # %bb.0:
-; RV32VB-NEXT:    lbu a1, 1(a0)
-; RV32VB-NEXT:    lbu a2, 0(a0)
+; RV32VB-NEXT:    lbu a1, 0(a0)
+; RV32VB-NEXT:    lbu a2, 1(a0)
 ; RV32VB-NEXT:    lbu a3, 22(a0)
 ; RV32VB-NEXT:    lbu a4, 31(a0)
-; RV32VB-NEXT:    slli a1, a1, 8
-; RV32VB-NEXT:    or a1, a2, a1
+; RV32VB-NEXT:    lbu a5, 623(a0)
+; RV32VB-NEXT:    lbu a6, 44(a0)
+; RV32VB-NEXT:    lbu a7, 55(a0)
+; RV32VB-NEXT:    lbu t0, 75(a0)
+; RV32VB-NEXT:    lbu t1, 82(a0)
+; RV32VB-NEXT:    slli a2, a2, 8
 ; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a4, a4, 24
+; RV32VB-NEXT:    or a1, a1, a2
 ; RV32VB-NEXT:    or a3, a4, a3
-; RV32VB-NEXT:    or a1, a1, a3
-; RV32VB-NEXT:    lbu a2, 44(a0)
-; RV32VB-NEXT:    lbu a3, 55(a0)
-; RV32VB-NEXT:    lbu a4, 623(a0)
-; RV32VB-NEXT:    lbu a5, 75(a0)
-; RV32VB-NEXT:    lbu a6, 82(a0)
-; RV32VB-NEXT:    slli a3, a3, 8
-; RV32VB-NEXT:    or a2, a2, a3
-; RV32VB-NEXT:    slli a4, a4, 16
-; RV32VB-NEXT:    slli a5, a5, 24
-; RV32VB-NEXT:    or a4, a5, a4
-; RV32VB-NEXT:    or a2, a2, a4
-; RV32VB-NEXT:    lbu a3, 93(a0)
+; RV32VB-NEXT:    lbu a2, 93(a0)
 ; RV32VB-NEXT:    lbu a4, 105(a0)
-; RV32VB-NEXT:    lbu a5, 124(a0)
-; RV32VB-NEXT:    lbu a7, 144(a0)
-; RV32VB-NEXT:    slli a3, a3, 8
-; RV32VB-NEXT:    lbu t0, 154(a0)
-; RV32VB-NEXT:    lbu t1, 161(a0)
-; RV32VB-NEXT:    or a3, a6, a3
-; RV32VB-NEXT:    slli a4, a4, 16
+; RV32VB-NEXT:    lbu t2, 124(a0)
+; RV32VB-NEXT:    lbu t3, 144(a0)
+; RV32VB-NEXT:    slli a7, a7, 8
+; RV32VB-NEXT:    slli a5, a5, 16
+; RV32VB-NEXT:    slli t0, t0, 24
+; RV32VB-NEXT:    slli a2, a2, 8
+; RV32VB-NEXT:    or a6, a6, a7
+; RV32VB-NEXT:    or a5, t0, a5
+; RV32VB-NEXT:    lbu a7, 154(a0)
+; RV32VB-NEXT:    lbu t0, 161(a0)
+; RV32VB-NEXT:    or a2, t1, a2
 ; RV32VB-NEXT:    lbu a0, 163(a0)
-; RV32VB-NEXT:    slli t1, t1, 24
-; RV32VB-NEXT:    or a4, t1, a4
-; RV32VB-NEXT:    or a3, a3, a4
-; RV32VB-NEXT:    slli a0, a0, 8
-; RV32VB-NEXT:    or a0, a5, a0
-; RV32VB-NEXT:    slli a7, a7, 16
+; RV32VB-NEXT:    slli a4, a4, 16
 ; RV32VB-NEXT:    slli t0, t0, 24
-; RV32VB-NEXT:    or a4, t0, a7
-; RV32VB-NEXT:    or a0, a0, a4
+; RV32VB-NEXT:    or a4, t0, a4
+; RV32VB-NEXT:    slli a0, a0, 8
+; RV32VB-NEXT:    or a0, t2, a0
+; RV32VB-NEXT:    slli t3, t3, 16
+; RV32VB-NEXT:    slli a7, a7, 24
+; RV32VB-NEXT:    or a7, a7, t3
+; RV32VB-NEXT:    or a1, a1, a3
+; RV32VB-NEXT:    or a3, a6, a5
+; RV32VB-NEXT:    or a2, a2, a4
+; RV32VB-NEXT:    or a0, a0, a7
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a3
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    ret
 ;
@@ -1851,34 +1818,34 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 22(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 31(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 623(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 44(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 55(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 75(a0)
+; RV32VB-PACK-NEXT:    lbu t1, 82(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    packh a2, a3, a4
-; RV32VB-PACK-NEXT:    lbu a3, 623(a0)
-; RV32VB-PACK-NEXT:    lbu a4, 44(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 55(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 75(a0)
-; RV32VB-PACK-NEXT:    pack a1, a1, a2
-; RV32VB-PACK-NEXT:    lbu a2, 82(a0)
-; RV32VB-PACK-NEXT:    packh a4, a4, a5
-; RV32VB-PACK-NEXT:    packh a3, a3, a6
-; RV32VB-PACK-NEXT:    pack a3, a4, a3
-; RV32VB-PACK-NEXT:    lbu a4, 154(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 161(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 163(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 93(a0)
-; RV32VB-PACK-NEXT:    lbu t0, 105(a0)
-; RV32VB-PACK-NEXT:    lbu t1, 124(a0)
+; RV32VB-PACK-NEXT:    lbu a2, 154(a0)
+; RV32VB-PACK-NEXT:    lbu t2, 161(a0)
+; RV32VB-PACK-NEXT:    lbu t3, 163(a0)
+; RV32VB-PACK-NEXT:    packh a3, a3, a4
+; RV32VB-PACK-NEXT:    packh a4, a6, a7
+; RV32VB-PACK-NEXT:    packh a5, a5, t0
+; RV32VB-PACK-NEXT:    lbu a6, 93(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 105(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 124(a0)
 ; RV32VB-PACK-NEXT:    lbu a0, 144(a0)
-; RV32VB-PACK-NEXT:    packh a2, a2, a7
-; RV32VB-PACK-NEXT:    packh a5, t0, a5
-; RV32VB-PACK-NEXT:    pack a2, a2, a5
-; RV32VB-PACK-NEXT:    packh a5, t1, a6
-; RV32VB-PACK-NEXT:    packh a0, a0, a4
-; RV32VB-PACK-NEXT:    pack a0, a5, a0
+; RV32VB-PACK-NEXT:    packh a6, t1, a6
+; RV32VB-PACK-NEXT:    packh a7, a7, t2
+; RV32VB-PACK-NEXT:    packh t0, t0, t3
+; RV32VB-PACK-NEXT:    packh a0, a0, a2
+; RV32VB-PACK-NEXT:    pack a1, a1, a3
+; RV32VB-PACK-NEXT:    pack a2, a4, a5
+; RV32VB-PACK-NEXT:    pack a3, a6, a7
+; RV32VB-PACK-NEXT:    pack a0, t0, a0
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
 ;
@@ -1900,32 +1867,31 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu t2, 154(a0)
 ; RV64V-ONLY-NEXT:    lbu t3, 161(a0)
 ; RV64V-ONLY-NEXT:    lbu t4, 163(a0)
+; RV64V-ONLY-NEXT:    li t5, 255
+; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV64V-ONLY-NEXT:    lbu t5, 93(a0)
 ; RV64V-ONLY-NEXT:    lbu t6, 105(a0)
 ; RV64V-ONLY-NEXT:    lbu s0, 124(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 144(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a1
+; RV64V-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t5
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t3
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a6
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, s0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t4
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a5
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, t0
-; RV64V-ONLY-NEXT:    vmv.v.x v8, t1
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t5
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t6
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t3
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, s0
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t4
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV64V-ONLY-NEXT:    li a0, 255
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, a0
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t2
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a0
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, t0
+; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, t2
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64V-ONLY-NEXT:    .cfi_restore s0
 ; RV64V-ONLY-NEXT:    addi sp, sp, 16
@@ -1934,90 +1900,98 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_loads_gather:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 1(a0)
-; RVA22U64-NEXT:    lbu a2, 0(a0)
+; RVA22U64-NEXT:    lbu a1, 0(a0)
+; RVA22U64-NEXT:    lbu a2, 1(a0)
 ; RVA22U64-NEXT:    lbu a3, 22(a0)
 ; RVA22U64-NEXT:    lbu a4, 31(a0)
-; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    lbu a6, 623(a0)
+; RVA22U64-NEXT:    lbu t0, 44(a0)
+; RVA22U64-NEXT:    lbu a7, 55(a0)
+; RVA22U64-NEXT:    lbu a5, 75(a0)
+; RVA22U64-NEXT:    lbu t1, 82(a0)
+; RVA22U64-NEXT:    slli a2, a2, 8
 ; RVA22U64-NEXT:    slli a3, a3, 16
 ; RVA22U64-NEXT:    slli a4, a4, 24
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    or a1, a1, a3
-; RVA22U64-NEXT:    lbu a2, 623(a0)
-; RVA22U64-NEXT:    lbu a3, 44(a0)
-; RVA22U64-NEXT:    lbu a4, 55(a0)
-; RVA22U64-NEXT:    lbu a5, 75(a0)
-; RVA22U64-NEXT:    lbu a6, 82(a0)
-; RVA22U64-NEXT:    slli a3, a3, 32
-; RVA22U64-NEXT:    slli a4, a4, 40
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    slli a2, a2, 48
-; RVA22U64-NEXT:    slli a5, a5, 56
-; RVA22U64-NEXT:    or a2, a2, a5
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    or a7, a1, a2
+; RVA22U64-NEXT:    or t2, a1, a2
+; RVA22U64-NEXT:    or t3, a4, a3
 ; RVA22U64-NEXT:    lbu a2, 93(a0)
-; RVA22U64-NEXT:    lbu t0, 105(a0)
-; RVA22U64-NEXT:    lbu a4, 124(a0)
-; RVA22U64-NEXT:    lbu a5, 144(a0)
+; RVA22U64-NEXT:    lbu t4, 105(a0)
+; RVA22U64-NEXT:    lbu t6, 124(a0)
+; RVA22U64-NEXT:    lbu t5, 144(a0)
+; RVA22U64-NEXT:    slli t0, t0, 32
+; RVA22U64-NEXT:    slli a7, a7, 40
+; RVA22U64-NEXT:    slli a6, a6, 48
+; RVA22U64-NEXT:    slli a5, a5, 56
 ; RVA22U64-NEXT:    slli a2, a2, 8
-; RVA22U64-NEXT:    lbu a1, 154(a0)
-; RVA22U64-NEXT:    lbu a3, 161(a0)
-; RVA22U64-NEXT:    or a2, a6, a2
-; RVA22U64-NEXT:    slli t0, t0, 16
+; RVA22U64-NEXT:    or a7, a7, t0
+; RVA22U64-NEXT:    or a5, a5, a6
+; RVA22U64-NEXT:    lbu a3, 154(a0)
+; RVA22U64-NEXT:    lbu a1, 161(a0)
+; RVA22U64-NEXT:    or a2, t1, a2
 ; RVA22U64-NEXT:    lbu a0, 163(a0)
-; RVA22U64-NEXT:    slli a3, a3, 24
-; RVA22U64-NEXT:    or a3, a3, t0
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    slli a4, a4, 32
+; RVA22U64-NEXT:    slli t4, t4, 16
+; RVA22U64-NEXT:    slli a1, a1, 24
+; RVA22U64-NEXT:    or a1, a1, t4
+; RVA22U64-NEXT:    slli t6, t6, 32
 ; RVA22U64-NEXT:    slli a0, a0, 40
-; RVA22U64-NEXT:    or a0, a0, a4
-; RVA22U64-NEXT:    slli a5, a5, 48
-; RVA22U64-NEXT:    slli a1, a1, 56
-; RVA22U64-NEXT:    or a1, a1, a5
+; RVA22U64-NEXT:    or a0, a0, t6
+; RVA22U64-NEXT:    slli t5, t5, 48
+; RVA22U64-NEXT:    slli a3, a3, 56
+; RVA22U64-NEXT:    or a3, a3, t5
+; RVA22U64-NEXT:    or a4, t2, t3
+; RVA22U64-NEXT:    or a5, a5, a7
+; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    or a0, a0, a3
+; RVA22U64-NEXT:    or a4, a4, a5
 ; RVA22U64-NEXT:    or a0, a0, a1
-; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-NEXT:    vmv.v.x v8, a7
+; RVA22U64-NEXT:    vmv.v.x v8, a4
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-NEXT:    ret
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_loads_gather:
 ; RVA22U64-PACK:       # %bb.0:
+; RVA22U64-PACK-NEXT:    addi sp, sp, -16
+; RVA22U64-PACK-NEXT:    .cfi_def_cfa_offset 16
+; RVA22U64-PACK-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RVA22U64-PACK-NEXT:    .cfi_offset s0, -8
 ; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
 ; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 22(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 31(a0)
-; RVA22U64-PACK-NEXT:    packh a6, a1, a2
-; RVA22U64-PACK-NEXT:    packh a2, a3, a4
-; RVA22U64-PACK-NEXT:    lbu a3, 623(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 44(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 55(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 75(a0)
-; RVA22U64-PACK-NEXT:    packw a2, a6, a2
-; RVA22U64-PACK-NEXT:    lbu a6, 82(a0)
-; RVA22U64-PACK-NEXT:    packh a4, a4, a5
-; RVA22U64-PACK-NEXT:    packh a1, a3, a1
-; RVA22U64-PACK-NEXT:    packw a1, a4, a1
-; RVA22U64-PACK-NEXT:    pack a7, a2, a1
-; RVA22U64-PACK-NEXT:    lbu t0, 154(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 161(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 163(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 105(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 124(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 22(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 31(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 623(a0)
+; RVA22U64-PACK-NEXT:    lbu t3, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu t4, 55(a0)
+; RVA22U64-PACK-NEXT:    lbu t5, 75(a0)
+; RVA22U64-PACK-NEXT:    lbu t1, 82(a0)
+; RVA22U64-PACK-NEXT:    packh t2, a1, a2
+; RVA22U64-PACK-NEXT:    lbu t6, 154(a0)
+; RVA22U64-PACK-NEXT:    lbu s0, 161(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 163(a0)
+; RVA22U64-PACK-NEXT:    packh a6, a6, a7
+; RVA22U64-PACK-NEXT:    packh a7, t3, t4
+; RVA22U64-PACK-NEXT:    packh a2, t0, t5
+; RVA22U64-PACK-NEXT:    lbu a4, 93(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 105(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 124(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 144(a0)
-; RVA22U64-PACK-NEXT:    packh a5, a6, a5
+; RVA22U64-PACK-NEXT:    packh a4, t1, a4
+; RVA22U64-PACK-NEXT:    packh a5, a5, s0
 ; RVA22U64-PACK-NEXT:    packh a1, a1, a3
-; RVA22U64-PACK-NEXT:    packw a1, a5, a1
-; RVA22U64-PACK-NEXT:    packh a2, a2, a4
-; RVA22U64-PACK-NEXT:    packh a0, a0, t0
-; RVA22U64-PACK-NEXT:    packw a0, a2, a0
-; RVA22U64-PACK-NEXT:    pack a0, a1, a0
+; RVA22U64-PACK-NEXT:    packh a0, a0, t6
+; RVA22U64-PACK-NEXT:    packw a3, t2, a6
+; RVA22U64-PACK-NEXT:    packw a2, a7, a2
+; RVA22U64-PACK-NEXT:    packw a4, a4, a5
+; RVA22U64-PACK-NEXT:    packw a0, a1, a0
+; RVA22U64-PACK-NEXT:    pack a1, a3, a2
+; RVA22U64-PACK-NEXT:    pack a0, a4, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-PACK-NEXT:    vmv.v.x v8, a7
+; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
+; RVA22U64-PACK-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; RVA22U64-PACK-NEXT:    .cfi_restore s0
+; RVA22U64-PACK-NEXT:    addi sp, sp, 16
+; RVA22U64-PACK-NEXT:    .cfi_def_cfa_offset 0
 ; RVA22U64-PACK-NEXT:    ret
 ;
 ; RV64ZVE32-LABEL: buildvec_v16i8_loads_gather:
@@ -2038,32 +2012,31 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu t2, 154(a0)
 ; RV64ZVE32-NEXT:    lbu t3, 161(a0)
 ; RV64ZVE32-NEXT:    lbu t4, 163(a0)
+; RV64ZVE32-NEXT:    li t5, 255
+; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, t5
 ; RV64ZVE32-NEXT:    lbu t5, 93(a0)
 ; RV64ZVE32-NEXT:    lbu t6, 105(a0)
 ; RV64ZVE32-NEXT:    lbu s0, 124(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 144(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32-NEXT:    vmv.v.x v9, t1
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t5
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t3
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, s0
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t4
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, t0
-; RV64ZVE32-NEXT:    vmv.v.x v8, t1
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t5
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t6
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t3
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, s0
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t4
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32-NEXT:    li a0, 255
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, a0
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t2
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a0
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, t0
+; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, t2
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64ZVE32-NEXT:    .cfi_restore s0
 ; RV64ZVE32-NEXT:    addi sp, sp, 16
@@ -2154,19 +2127,19 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ; RV32VB-NEXT:    lbu a6, 154(a0)
 ; RV32VB-NEXT:    lbu a7, 161(a0)
 ; RV32VB-NEXT:    or a1, a2, a1
-; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    lbu a0, 163(a0)
+; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a7, a7, 24
 ; RV32VB-NEXT:    or a2, a7, a3
-; RV32VB-NEXT:    or a1, a1, a2
 ; RV32VB-NEXT:    slli a0, a0, 8
 ; RV32VB-NEXT:    or a0, a4, a0
 ; RV32VB-NEXT:    slli a5, a5, 16
 ; RV32VB-NEXT:    slli a6, a6, 24
-; RV32VB-NEXT:    or a2, a6, a5
-; RV32VB-NEXT:    or a0, a0, a2
+; RV32VB-NEXT:    or a3, a6, a5
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.i v8, 0
+; RV32VB-NEXT:    or a1, a1, a2
+; RV32VB-NEXT:    or a0, a0, a3
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, zero
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
@@ -2174,26 +2147,26 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ;
 ; RV32VB-PACK-LABEL: buildvec_v16i8_undef_low_half:
 ; RV32VB-PACK:       # %bb.0:
-; RV32VB-PACK-NEXT:    lbu a1, 144(a0)
-; RV32VB-PACK-NEXT:    lbu a2, 154(a0)
-; RV32VB-PACK-NEXT:    lbu a3, 161(a0)
-; RV32VB-PACK-NEXT:    lbu a4, 82(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 93(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 105(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 124(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 163(a0)
-; RV32VB-PACK-NEXT:    packh a4, a4, a5
-; RV32VB-PACK-NEXT:    packh a3, a6, a3
-; RV32VB-PACK-NEXT:    pack a3, a4, a3
-; RV32VB-PACK-NEXT:    packh a0, a7, a0
+; RV32VB-PACK-NEXT:    lbu a1, 82(a0)
+; RV32VB-PACK-NEXT:    lbu a2, 93(a0)
+; RV32VB-PACK-NEXT:    lbu a3, 105(a0)
+; RV32VB-PACK-NEXT:    lbu a4, 124(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 161(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 163(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 144(a0)
+; RV32VB-PACK-NEXT:    lbu a0, 154(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    pack a0, a0, a1
-; RV32VB-PACK-NEXT:    packh a1, a0, a0
-; RV32VB-PACK-NEXT:    pack a1, a1, a1
+; RV32VB-PACK-NEXT:    packh a2, a3, a5
+; RV32VB-PACK-NEXT:    packh a3, a4, a6
+; RV32VB-PACK-NEXT:    packh a0, a7, a0
+; RV32VB-PACK-NEXT:    pack a1, a1, a2
+; RV32VB-PACK-NEXT:    packh a2, a0, a0
+; RV32VB-PACK-NEXT:    pack a2, a2, a2
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
+; RV32VB-PACK-NEXT:    vmv.v.x v8, a2
+; RV32VB-PACK-NEXT:    pack a0, a3, a0
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
 ;
@@ -2229,17 +2202,17 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ; RVA22U64-NEXT:    lbu a2, 154(a0)
 ; RVA22U64-NEXT:    lbu a3, 161(a0)
 ; RVA22U64-NEXT:    or a1, a6, a1
-; RVA22U64-NEXT:    slli a7, a7, 16
 ; RVA22U64-NEXT:    lbu a0, 163(a0)
+; RVA22U64-NEXT:    slli a7, a7, 16
 ; RVA22U64-NEXT:    slli a3, a3, 24
 ; RVA22U64-NEXT:    or a3, a3, a7
-; RVA22U64-NEXT:    or a1, a1, a3
 ; RVA22U64-NEXT:    slli a4, a4, 32
 ; RVA22U64-NEXT:    slli a0, a0, 40
 ; RVA22U64-NEXT:    or a0, a0, a4
 ; RVA22U64-NEXT:    slli a5, a5, 48
 ; RVA22U64-NEXT:    slli a2, a2, 56
 ; RVA22U64-NEXT:    or a2, a2, a5
+; RVA22U64-NEXT:    or a1, a1, a3
 ; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -2249,26 +2222,26 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_low_half:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a6, 144(a0)
-; RVA22U64-PACK-NEXT:    lbu a7, 154(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 161(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 82(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 105(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 124(a0)
-; RVA22U64-PACK-NEXT:    lbu a0, 163(a0)
-; RVA22U64-PACK-NEXT:    packh a4, a4, a5
-; RVA22U64-PACK-NEXT:    packh a1, a1, a3
-; RVA22U64-PACK-NEXT:    packw a1, a4, a1
+; RVA22U64-PACK-NEXT:    lbu a6, 82(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 93(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 105(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 124(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 161(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 163(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 144(a0)
+; RVA22U64-PACK-NEXT:    lbu a0, 154(a0)
+; RVA22U64-PACK-NEXT:    packh a3, a6, a7
+; RVA22U64-PACK-NEXT:    packh a5, t0, a5
+; RVA22U64-PACK-NEXT:    packh a1, a4, a1
 ; RVA22U64-PACK-NEXT:    packh a0, a2, a0
-; RVA22U64-PACK-NEXT:    packh a2, a6, a7
-; RVA22U64-PACK-NEXT:    packw a0, a0, a2
-; RVA22U64-PACK-NEXT:    pack a0, a1, a0
-; RVA22U64-PACK-NEXT:    packh a1, a0, a0
-; RVA22U64-PACK-NEXT:    packw a1, a1, a1
-; RVA22U64-PACK-NEXT:    pack a1, a1, a1
+; RVA22U64-PACK-NEXT:    packw a2, a3, a5
+; RVA22U64-PACK-NEXT:    packh a3, a0, a0
+; RVA22U64-PACK-NEXT:    packw a3, a3, a3
+; RVA22U64-PACK-NEXT:    pack a3, a3, a3
+; RVA22U64-PACK-NEXT:    packw a0, a1, a0
+; RVA22U64-PACK-NEXT:    pack a0, a2, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
+; RVA22U64-PACK-NEXT:    vmv.v.x v8, a3
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -2347,25 +2320,25 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ; RV32VB-LABEL: buildvec_v16i8_undef_high_half:
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    lbu a1, 1(a0)
-; RV32VB-NEXT:    lbu a2, 0(a0)
-; RV32VB-NEXT:    lbu a3, 22(a0)
-; RV32VB-NEXT:    lbu a4, 31(a0)
+; RV32VB-NEXT:    lbu a2, 22(a0)
+; RV32VB-NEXT:    lbu a3, 31(a0)
+; RV32VB-NEXT:    lbu a4, 0(a0)
 ; RV32VB-NEXT:    slli a1, a1, 8
-; RV32VB-NEXT:    or a1, a2, a1
-; RV32VB-NEXT:    slli a3, a3, 16
-; RV32VB-NEXT:    slli a4, a4, 24
-; RV32VB-NEXT:    or a3, a4, a3
-; RV32VB-NEXT:    lbu a2, 44(a0)
-; RV32VB-NEXT:    lbu a4, 55(a0)
-; RV32VB-NEXT:    or a1, a1, a3
+; RV32VB-NEXT:    slli a2, a2, 16
+; RV32VB-NEXT:    slli a3, a3, 24
+; RV32VB-NEXT:    or a1, a4, a1
+; RV32VB-NEXT:    lbu a4, 44(a0)
+; RV32VB-NEXT:    lbu a5, 55(a0)
+; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    lbu a3, 623(a0)
 ; RV32VB-NEXT:    lbu a0, 75(a0)
-; RV32VB-NEXT:    slli a4, a4, 8
-; RV32VB-NEXT:    or a2, a2, a4
+; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    or a4, a4, a5
 ; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
 ; RV32VB-NEXT:    or a0, a0, a3
-; RV32VB-NEXT:    or a0, a2, a0
+; RV32VB-NEXT:    or a1, a1, a2
+; RV32VB-NEXT:    or a0, a4, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
@@ -2379,21 +2352,21 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 22(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 31(a0)
-; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    lbu a2, 623(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 44(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 55(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 623(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 44(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 55(a0)
 ; RV32VB-PACK-NEXT:    lbu a0, 75(a0)
-; RV32VB-PACK-NEXT:    packh a3, a3, a4
-; RV32VB-PACK-NEXT:    pack a1, a1, a3
-; RV32VB-PACK-NEXT:    packh a3, a5, a6
-; RV32VB-PACK-NEXT:    packh a0, a2, a0
+; RV32VB-PACK-NEXT:    packh a1, a1, a2
+; RV32VB-PACK-NEXT:    packh a2, a3, a4
+; RV32VB-PACK-NEXT:    packh a3, a6, a7
+; RV32VB-PACK-NEXT:    packh a0, a5, a0
+; RV32VB-PACK-NEXT:    pack a1, a1, a2
+; RV32VB-PACK-NEXT:    packh a2, a0, a0
 ; RV32VB-PACK-NEXT:    pack a0, a3, a0
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
-; RV32VB-PACK-NEXT:    packh a0, a0, a0
-; RV32VB-PACK-NEXT:    pack a0, a0, a0
+; RV32VB-PACK-NEXT:    pack a0, a2, a2
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
@@ -2423,26 +2396,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ; RVA22U64-LABEL: buildvec_v16i8_undef_high_half:
 ; RVA22U64:       # %bb.0:
 ; RVA22U64-NEXT:    lbu a1, 1(a0)
-; RVA22U64-NEXT:    lbu a2, 0(a0)
-; RVA22U64-NEXT:    lbu a3, 22(a0)
-; RVA22U64-NEXT:    lbu a4, 31(a0)
+; RVA22U64-NEXT:    lbu a2, 22(a0)
+; RVA22U64-NEXT:    lbu a3, 31(a0)
+; RVA22U64-NEXT:    lbu a4, 0(a0)
 ; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    or a1, a1, a2
-; RVA22U64-NEXT:    slli a3, a3, 16
-; RVA22U64-NEXT:    slli a4, a4, 24
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    or a1, a1, a3
-; RVA22U64-NEXT:    lbu a2, 44(a0)
-; RVA22U64-NEXT:    lbu a3, 55(a0)
-; RVA22U64-NEXT:    lbu a4, 623(a0)
-; RVA22U64-NEXT:    lbu a0, 75(a0)
-; RVA22U64-NEXT:    slli a2, a2, 32
-; RVA22U64-NEXT:    slli a3, a3, 40
+; RVA22U64-NEXT:    slli a2, a2, 16
+; RVA22U64-NEXT:    slli a3, a3, 24
+; RVA22U64-NEXT:    or a1, a1, a4
 ; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    slli a4, a4, 48
+; RVA22U64-NEXT:    lbu a3, 44(a0)
+; RVA22U64-NEXT:    lbu a4, 55(a0)
+; RVA22U64-NEXT:    lbu a5, 623(a0)
+; RVA22U64-NEXT:    lbu a0, 75(a0)
+; RVA22U64-NEXT:    slli a3, a3, 32
+; RVA22U64-NEXT:    slli a4, a4, 40
+; RVA22U64-NEXT:    or a3, a3, a4
+; RVA22U64-NEXT:    slli a5, a5, 48
 ; RVA22U64-NEXT:    slli a0, a0, 56
-; RVA22U64-NEXT:    or a0, a0, a4
-; RVA22U64-NEXT:    or a0, a0, a2
+; RVA22U64-NEXT:    or a0, a0, a5
+; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    or a0, a0, a3
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-NEXT:    vmv.v.x v8, a0
@@ -2451,26 +2424,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_high_half:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu a6, 22(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 0(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 1(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 22(a0)
 ; RVA22U64-PACK-NEXT:    lbu a4, 31(a0)
-; RVA22U64-PACK-NEXT:    packh a1, a1, a2
-; RVA22U64-PACK-NEXT:    lbu a2, 623(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 44(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 55(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 623(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 55(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 75(a0)
-; RVA22U64-PACK-NEXT:    packh a4, a6, a4
-; RVA22U64-PACK-NEXT:    packw a1, a1, a4
-; RVA22U64-PACK-NEXT:    packh a3, a5, a3
-; RVA22U64-PACK-NEXT:    packh a0, a2, a0
-; RVA22U64-PACK-NEXT:    packw a0, a3, a0
-; RVA22U64-PACK-NEXT:    pack a0, a1, a0
+; RVA22U64-PACK-NEXT:    packh a3, a6, a7
+; RVA22U64-PACK-NEXT:    packh a4, t0, a4
+; RVA22U64-PACK-NEXT:    packh a1, a1, a2
+; RVA22U64-PACK-NEXT:    packh a0, a5, a0
+; RVA22U64-PACK-NEXT:    packw a2, a3, a4
+; RVA22U64-PACK-NEXT:    packh a3, a0, a0
+; RVA22U64-PACK-NEXT:    packw a3, a3, a3
+; RVA22U64-PACK-NEXT:    packw a0, a1, a0
+; RVA22U64-PACK-NEXT:    pack a0, a2, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a0
-; RVA22U64-PACK-NEXT:    packh a0, a0, a0
-; RVA22U64-PACK-NEXT:    packw a0, a0, a0
-; RVA22U64-PACK-NEXT:    pack a0, a0, a0
+; RVA22U64-PACK-NEXT:    pack a0, a3, a3
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -2531,54 +2504,53 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu a3, 44(a0)
 ; RV32-ONLY-NEXT:    lbu a4, 55(a0)
 ; RV32-ONLY-NEXT:    lbu a5, 75(a0)
+; RV32-ONLY-NEXT:    li a6, 255
+; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, a6
 ; RV32-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV32-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV32-ONLY-NEXT:    lbu t0, 105(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 161(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a2
+; RV32-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a7
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t0
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, a5
-; RV32-ONLY-NEXT:    vmv.v.x v8, a6
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t0
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-ONLY-NEXT:    li a0, 255
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, a0
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 4
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, a5
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 4
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    ret
 ;
 ; RV32VB-LABEL: buildvec_v16i8_undef_edges:
 ; RV32VB:       # %bb.0:
-; RV32VB-NEXT:    lbu a1, 55(a0)
-; RV32VB-NEXT:    lbu a2, 31(a0)
-; RV32VB-NEXT:    lbu a3, 44(a0)
-; RV32VB-NEXT:    lbu a4, 623(a0)
-; RV32VB-NEXT:    lbu a5, 75(a0)
-; RV32VB-NEXT:    slli a1, a1, 8
+; RV32VB-NEXT:    lbu a1, 623(a0)
+; RV32VB-NEXT:    lbu a2, 55(a0)
+; RV32VB-NEXT:    lbu a3, 75(a0)
+; RV32VB-NEXT:    lbu a4, 31(a0)
+; RV32VB-NEXT:    lbu a5, 44(a0)
+; RV32VB-NEXT:    slli a2, a2, 8
+; RV32VB-NEXT:    slli a1, a1, 16
+; RV32VB-NEXT:    slli a3, a3, 24
+; RV32VB-NEXT:    or a2, a5, a2
+; RV32VB-NEXT:    lbu a5, 82(a0)
+; RV32VB-NEXT:    lbu a6, 93(a0)
 ; RV32VB-NEXT:    or a1, a3, a1
-; RV32VB-NEXT:    slli a4, a4, 16
-; RV32VB-NEXT:    slli a5, a5, 24
-; RV32VB-NEXT:    or a4, a5, a4
-; RV32VB-NEXT:    lbu a3, 82(a0)
-; RV32VB-NEXT:    lbu a5, 93(a0)
-; RV32VB-NEXT:    or a1, a1, a4
-; RV32VB-NEXT:    lbu a4, 105(a0)
+; RV32VB-NEXT:    lbu a3, 105(a0)
 ; RV32VB-NEXT:    lbu a0, 161(a0)
-; RV32VB-NEXT:    slli a5, a5, 8
-; RV32VB-NEXT:    or a3, a3, a5
-; RV32VB-NEXT:    slli a2, a2, 24
-; RV32VB-NEXT:    slli a4, a4, 16
+; RV32VB-NEXT:    slli a6, a6, 8
+; RV32VB-NEXT:    or a5, a5, a6
+; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
-; RV32VB-NEXT:    or a0, a0, a4
-; RV32VB-NEXT:    or a0, a3, a0
+; RV32VB-NEXT:    or a0, a0, a3
+; RV32VB-NEXT:    slli a4, a4, 24
+; RV32VB-NEXT:    or a1, a2, a1
+; RV32VB-NEXT:    or a0, a5, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32VB-NEXT:    vmv.v.x v8, a2
+; RV32VB-NEXT:    vmv.v.x v8, a4
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, zero
@@ -2591,18 +2563,18 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a3, 44(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 55(a0)
 ; RV32VB-PACK-NEXT:    lbu a5, 75(a0)
-; RV32VB-PACK-NEXT:    packh a2, a0, a2
+; RV32VB-PACK-NEXT:    lbu a6, 82(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 93(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 105(a0)
+; RV32VB-PACK-NEXT:    lbu a0, 161(a0)
 ; RV32VB-PACK-NEXT:    packh a3, a3, a4
 ; RV32VB-PACK-NEXT:    packh a1, a1, a5
-; RV32VB-PACK-NEXT:    lbu a4, 82(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 93(a0)
-; RV32VB-PACK-NEXT:    pack a1, a3, a1
-; RV32VB-PACK-NEXT:    lbu a3, 105(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 161(a0)
-; RV32VB-PACK-NEXT:    packh a4, a4, a5
+; RV32VB-PACK-NEXT:    packh a4, a6, a7
+; RV32VB-PACK-NEXT:    packh a0, t0, a0
 ; RV32VB-PACK-NEXT:    packh a5, a0, a0
+; RV32VB-PACK-NEXT:    packh a2, a0, a2
 ; RV32VB-PACK-NEXT:    pack a2, a5, a2
-; RV32VB-PACK-NEXT:    packh a0, a3, a0
+; RV32VB-PACK-NEXT:    pack a1, a3, a1
 ; RV32VB-PACK-NEXT:    pack a0, a4, a0
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a2
@@ -2619,84 +2591,83 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu a3, 44(a0)
 ; RV64V-ONLY-NEXT:    lbu a4, 55(a0)
 ; RV64V-ONLY-NEXT:    lbu a5, 75(a0)
+; RV64V-ONLY-NEXT:    li a6, 255
+; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, a6
 ; RV64V-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV64V-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV64V-ONLY-NEXT:    lbu t0, 105(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 161(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a2
+; RV64V-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a1
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, a5
-; RV64V-ONLY-NEXT:    vmv.v.x v8, a6
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t0
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV64V-ONLY-NEXT:    li a0, 255
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, a0
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 4
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a0
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, a5
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 4
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_undef_edges:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 31(a0)
+; RVA22U64-NEXT:    lbu a6, 31(a0)
 ; RVA22U64-NEXT:    lbu a2, 44(a0)
 ; RVA22U64-NEXT:    lbu a3, 55(a0)
 ; RVA22U64-NEXT:    lbu a4, 623(a0)
 ; RVA22U64-NEXT:    lbu a5, 75(a0)
 ; RVA22U64-NEXT:    slli a2, a2, 32
 ; RVA22U64-NEXT:    slli a3, a3, 40
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    slli a1, a1, 24
 ; RVA22U64-NEXT:    slli a4, a4, 48
 ; RVA22U64-NEXT:    slli a5, a5, 56
-; RVA22U64-NEXT:    or a4, a4, a5
-; RVA22U64-NEXT:    or a2, a2, a4
+; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    lbu a3, 82(a0)
-; RVA22U64-NEXT:    lbu a4, 93(a0)
-; RVA22U64-NEXT:    add.uw a1, a1, a2
-; RVA22U64-NEXT:    lbu a2, 105(a0)
+; RVA22U64-NEXT:    lbu a1, 93(a0)
+; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    lbu a5, 105(a0)
 ; RVA22U64-NEXT:    lbu a0, 161(a0)
-; RVA22U64-NEXT:    slli a4, a4, 8
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    slli a2, a2, 16
+; RVA22U64-NEXT:    slli a1, a1, 8
+; RVA22U64-NEXT:    or a1, a1, a3
+; RVA22U64-NEXT:    slli a5, a5, 16
 ; RVA22U64-NEXT:    slli a0, a0, 24
-; RVA22U64-NEXT:    or a0, a0, a2
-; RVA22U64-NEXT:    or a0, a0, a3
+; RVA22U64-NEXT:    or a0, a0, a5
+; RVA22U64-NEXT:    slli a6, a6, 24
+; RVA22U64-NEXT:    or a2, a2, a4
+; RVA22U64-NEXT:    add.uw a2, a6, a2
+; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-NEXT:    vmv.v.x v8, a1
+; RVA22U64-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-NEXT:    ret
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_edges:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a1, 623(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 31(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 623(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 31(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 44(a0)
 ; RVA22U64-PACK-NEXT:    lbu a4, 55(a0)
 ; RVA22U64-PACK-NEXT:    lbu a5, 75(a0)
-; RVA22U64-PACK-NEXT:    packh a6, a0, a2
-; RVA22U64-PACK-NEXT:    packh a2, a0, a0
-; RVA22U64-PACK-NEXT:    packh a3, a3, a4
-; RVA22U64-PACK-NEXT:    packh a1, a1, a5
-; RVA22U64-PACK-NEXT:    packw a7, a3, a1
-; RVA22U64-PACK-NEXT:    lbu a3, 82(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 105(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 82(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 93(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 105(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 161(a0)
-; RVA22U64-PACK-NEXT:    packw a1, a2, a6
-; RVA22U64-PACK-NEXT:    pack a1, a1, a7
-; RVA22U64-PACK-NEXT:    packh a3, a3, a4
-; RVA22U64-PACK-NEXT:    packh a0, a5, a0
-; RVA22U64-PACK-NEXT:    packw a0, a3, a0
+; RVA22U64-PACK-NEXT:    packh a4, t0, a4
+; RVA22U64-PACK-NEXT:    packh a5, a7, a5
+; RVA22U64-PACK-NEXT:    packh a1, a2, a1
+; RVA22U64-PACK-NEXT:    packh a0, a3, a0
+; RVA22U64-PACK-NEXT:    packh a2, a0, a0
+; RVA22U64-PACK-NEXT:    packh a3, a0, a6
+; RVA22U64-PACK-NEXT:    packw a3, a2, a3
+; RVA22U64-PACK-NEXT:    packw a2, a2, a2
+; RVA22U64-PACK-NEXT:    packw a4, a4, a5
+; RVA22U64-PACK-NEXT:    packw a0, a1, a0
+; RVA22U64-PACK-NEXT:    pack a1, a3, a4
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
-; RVA22U64-PACK-NEXT:    packw a1, a2, a2
-; RVA22U64-PACK-NEXT:    pack a0, a0, a1
+; RVA22U64-PACK-NEXT:    pack a0, a0, a2
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -2707,26 +2678,25 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu a3, 44(a0)
 ; RV64ZVE32-NEXT:    lbu a4, 55(a0)
 ; RV64ZVE32-NEXT:    lbu a5, 75(a0)
+; RV64ZVE32-NEXT:    li a6, 255
+; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a6
 ; RV64ZVE32-NEXT:    lbu a6, 82(a0)
 ; RV64ZVE32-NEXT:    lbu a7, 93(a0)
 ; RV64ZVE32-NEXT:    lbu t0, 105(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 161(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a2
+; RV64ZVE32-NEXT:    vmv.v.x v9, a6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, a5
-; RV64ZVE32-NEXT:    vmv.v.x v8, a6
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t0
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32-NEXT:    li a0, 255
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, a0
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 4
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a0
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, a5
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 4
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ret
   %p4 = getelementptr i8, ptr %p, i32 31
   %p5 = getelementptr i8, ptr %p, i32 44
@@ -2771,58 +2741,57 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV32-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV32-ONLY-NEXT:    lbu t0, 124(a0)
+; RV32-ONLY-NEXT:    li t1, 255
+; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, t1
 ; RV32-ONLY-NEXT:    lbu t1, 144(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 154(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a1
+; RV32-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a7
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 2
+; RV32-ONLY-NEXT:    vslidedown.vi v9, v9, 2
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t0
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-ONLY-NEXT:    vslidedown.vi v9, v9, 1
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, a5
-; RV32-ONLY-NEXT:    vmv.v.x v8, a6
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 2
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t0
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t1
-; RV32-ONLY-NEXT:    li a1, 255
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, a1
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t1
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, a5
+; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    ret
 ;
 ; RV32VB-LABEL: buildvec_v16i8_loads_undef_scattered:
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    lbu a1, 1(a0)
 ; RV32VB-NEXT:    lbu a2, 0(a0)
-; RV32VB-NEXT:    lbu a3, 55(a0)
-; RV32VB-NEXT:    lbu a4, 44(a0)
+; RV32VB-NEXT:    lbu a3, 44(a0)
+; RV32VB-NEXT:    lbu a4, 55(a0)
 ; RV32VB-NEXT:    slli a1, a1, 8
 ; RV32VB-NEXT:    or a1, a2, a1
-; RV32VB-NEXT:    slli a3, a3, 8
-; RV32VB-NEXT:    or a3, a4, a3
 ; RV32VB-NEXT:    lbu a2, 75(a0)
-; RV32VB-NEXT:    lbu a4, 82(a0)
-; RV32VB-NEXT:    lbu a5, 93(a0)
-; RV32VB-NEXT:    lbu a6, 124(a0)
-; RV32VB-NEXT:    slli a2, a2, 24
-; RV32VB-NEXT:    or a2, a3, a2
-; RV32VB-NEXT:    lbu a3, 144(a0)
+; RV32VB-NEXT:    lbu a5, 82(a0)
+; RV32VB-NEXT:    lbu a6, 93(a0)
+; RV32VB-NEXT:    lbu a7, 124(a0)
+; RV32VB-NEXT:    slli a4, a4, 8
+; RV32VB-NEXT:    or a3, a3, a4
+; RV32VB-NEXT:    lbu a4, 144(a0)
 ; RV32VB-NEXT:    lbu a0, 154(a0)
-; RV32VB-NEXT:    slli a5, a5, 8
-; RV32VB-NEXT:    or a4, a4, a5
-; RV32VB-NEXT:    slli a3, a3, 16
+; RV32VB-NEXT:    slli a6, a6, 8
+; RV32VB-NEXT:    or a5, a5, a6
+; RV32VB-NEXT:    slli a4, a4, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
-; RV32VB-NEXT:    or a0, a0, a3
-; RV32VB-NEXT:    or a0, a6, a0
+; RV32VB-NEXT:    or a0, a0, a4
+; RV32VB-NEXT:    slli a2, a2, 24
+; RV32VB-NEXT:    or a2, a3, a2
+; RV32VB-NEXT:    or a0, a7, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a2
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a4
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    ret
 ;
@@ -2832,26 +2801,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 44(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 55(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 75(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 82(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 93(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    packh a2, a3, a4
-; RV32VB-PACK-NEXT:    lbu a3, 75(a0)
-; RV32VB-PACK-NEXT:    lbu a4, 82(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 93(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 124(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 144(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 154(a0)
-; RV32VB-PACK-NEXT:    packh a3, a0, a3
-; RV32VB-PACK-NEXT:    pack a2, a2, a3
-; RV32VB-PACK-NEXT:    packh a3, a4, a5
-; RV32VB-PACK-NEXT:    packh a0, a7, a0
-; RV32VB-PACK-NEXT:    packh a4, a6, a0
-; RV32VB-PACK-NEXT:    pack a0, a4, a0
-; RV32VB-PACK-NEXT:    packh a4, a0, a0
-; RV32VB-PACK-NEXT:    pack a1, a1, a4
+; RV32VB-PACK-NEXT:    lbu a2, 144(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 154(a0)
+; RV32VB-PACK-NEXT:    packh a3, a3, a4
+; RV32VB-PACK-NEXT:    lbu a0, 124(a0)
+; RV32VB-PACK-NEXT:    packh a4, a6, a7
+; RV32VB-PACK-NEXT:    packh a2, a2, t0
+; RV32VB-PACK-NEXT:    packh a5, a0, a5
+; RV32VB-PACK-NEXT:    pack a3, a3, a5
+; RV32VB-PACK-NEXT:    packh a5, a0, a0
+; RV32VB-PACK-NEXT:    packh a0, a0, a0
+; RV32VB-PACK-NEXT:    pack a0, a0, a2
+; RV32VB-PACK-NEXT:    pack a1, a1, a5
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
-; RV32VB-PACK-NEXT:    pack a1, a3, a4
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
+; RV32VB-PACK-NEXT:    pack a1, a4, a5
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
@@ -2866,28 +2835,27 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV64V-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV64V-ONLY-NEXT:    lbu t0, 124(a0)
+; RV64V-ONLY-NEXT:    li t1, 255
+; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, t1
 ; RV64V-ONLY-NEXT:    lbu t1, 144(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 154(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a1
+; RV64V-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 2
+; RV64V-ONLY-NEXT:    vslidedown.vi v9, v9, 2
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV64V-ONLY-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 1
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, a5
-; RV64V-ONLY-NEXT:    vmv.v.x v8, a6
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 2
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t0
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 1
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t1
-; RV64V-ONLY-NEXT:    li a1, 255
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, a1
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t1
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, a5
+; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_loads_undef_scattered:
@@ -2898,26 +2866,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RVA22U64-NEXT:    lbu a4, 55(a0)
 ; RVA22U64-NEXT:    slli a1, a1, 8
 ; RVA22U64-NEXT:    or a6, a2, a1
+; RVA22U64-NEXT:    lbu a7, 75(a0)
+; RVA22U64-NEXT:    lbu a5, 82(a0)
+; RVA22U64-NEXT:    lbu a1, 93(a0)
+; RVA22U64-NEXT:    lbu a2, 124(a0)
 ; RVA22U64-NEXT:    slli a3, a3, 32
 ; RVA22U64-NEXT:    slli a4, a4, 40
 ; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    lbu a2, 75(a0)
-; RVA22U64-NEXT:    lbu a4, 82(a0)
-; RVA22U64-NEXT:    lbu a5, 93(a0)
-; RVA22U64-NEXT:    lbu a1, 124(a0)
-; RVA22U64-NEXT:    slli a2, a2, 56
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    or a2, a6, a2
-; RVA22U64-NEXT:    lbu a3, 144(a0)
+; RVA22U64-NEXT:    lbu a4, 144(a0)
 ; RVA22U64-NEXT:    lbu a0, 154(a0)
-; RVA22U64-NEXT:    slli a5, a5, 8
-; RVA22U64-NEXT:    or a4, a4, a5
-; RVA22U64-NEXT:    slli a3, a3, 48
+; RVA22U64-NEXT:    slli a1, a1, 8
+; RVA22U64-NEXT:    or a1, a1, a5
+; RVA22U64-NEXT:    slli a4, a4, 48
 ; RVA22U64-NEXT:    slli a0, a0, 56
-; RVA22U64-NEXT:    or a0, a0, a3
-; RVA22U64-NEXT:    slli a1, a1, 32
-; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    or a0, a0, a4
+; RVA22U64-NEXT:    slli a7, a7, 56
+; RVA22U64-NEXT:    or a3, a7, a3
+; RVA22U64-NEXT:    slli a2, a2, 32
+; RVA22U64-NEXT:    or a0, a0, a2
+; RVA22U64-NEXT:    or a2, a6, a3
+; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
@@ -2927,26 +2895,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RVA22U64-PACK:       # %bb.0:
 ; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
 ; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 44(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 55(a0)
-; RVA22U64-PACK-NEXT:    packh a6, a1, a2
-; RVA22U64-PACK-NEXT:    packh a2, a3, a4
-; RVA22U64-PACK-NEXT:    lbu a3, 75(a0)
-; RVA22U64-PACK-NEXT:    lbu a7, 82(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu t0, 124(a0)
-; RVA22U64-PACK-NEXT:    packh a3, a0, a3
-; RVA22U64-PACK-NEXT:    packw a2, a2, a3
-; RVA22U64-PACK-NEXT:    packh a3, a0, a0
-; RVA22U64-PACK-NEXT:    lbu a4, 144(a0)
-; RVA22U64-PACK-NEXT:    lbu a0, 154(a0)
-; RVA22U64-PACK-NEXT:    packw a1, a6, a3
-; RVA22U64-PACK-NEXT:    pack a1, a1, a2
-; RVA22U64-PACK-NEXT:    packh a2, a7, a5
-; RVA22U64-PACK-NEXT:    packh a0, a4, a0
-; RVA22U64-PACK-NEXT:    packh a4, t0, a0
-; RVA22U64-PACK-NEXT:    packw a0, a4, a0
-; RVA22U64-PACK-NEXT:    packw a2, a2, a3
+; RVA22U64-PACK-NEXT:    lbu a7, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 55(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 75(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 82(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 93(a0)
+; RVA22U64-PACK-NEXT:    packh t1, a1, a2
+; RVA22U64-PACK-NEXT:    lbu a2, 144(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 154(a0)
+; RVA22U64-PACK-NEXT:    packh a1, a7, t0
+; RVA22U64-PACK-NEXT:    lbu a0, 124(a0)
+; RVA22U64-PACK-NEXT:    packh a3, a5, a3
+; RVA22U64-PACK-NEXT:    packh a2, a2, a4
+; RVA22U64-PACK-NEXT:    packh a4, a0, a6
+; RVA22U64-PACK-NEXT:    packw a1, a1, a4
+; RVA22U64-PACK-NEXT:    packh a4, a0, a0
+; RVA22U64-PACK-NEXT:    packh a0, a0, a0
+; RVA22U64-PACK-NEXT:    packw a5, t1, a4
+; RVA22U64-PACK-NEXT:    packw a0, a0, a2
+; RVA22U64-PACK-NEXT:    packw a2, a3, a4
+; RVA22U64-PACK-NEXT:    pack a1, a5, a1
 ; RVA22U64-PACK-NEXT:    pack a0, a2, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
@@ -2963,28 +2931,27 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu a6, 82(a0)
 ; RV64ZVE32-NEXT:    lbu a7, 93(a0)
 ; RV64ZVE32-NEXT:    lbu t0, 124(a0)
+; RV64ZVE32-NEXT:    li t1, 255
+; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, t1
 ; RV64ZVE32-NEXT:    lbu t1, 144(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 154(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32-NEXT:    vmv.v.x v9, a6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32-NEXT:    vslidedown.vi v9, v9, 2
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 1
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, a5
-; RV64ZVE32-NEXT:    vmv.v.x v8, a6
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 2
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t0
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 1
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t1
-; RV64ZVE32-NEXT:    li a1, 255
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, a1
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t1
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, a5
+; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ret
   %p2 = getelementptr i8, ptr %p, i32 1
   %p3 = getelementptr i8, ptr %p, i32 22
@@ -3043,91 +3010,91 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RV32-ONLY:       # %bb.0:
 ; RV32-ONLY-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a0
+; RV32-ONLY-NEXT:    vmv.v.x v9, a4
+; RV32-ONLY-NEXT:    vmv.v.i v0, 15
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a5
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, a3
-; RV32-ONLY-NEXT:    vmv.v.x v8, a4
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a5
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a6
-; RV32-ONLY-NEXT:    vmv.v.i v0, 15
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a6
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, a7
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV32-ONLY-NEXT:    ret
 ;
 ; RV32VB-LABEL: buildvec_v8i8_pack:
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    slli a7, a7, 24
 ; RV32VB-NEXT:    andi a6, a6, 255
-; RV32VB-NEXT:    slli a6, a6, 16
-; RV32VB-NEXT:    or a6, a7, a6
 ; RV32VB-NEXT:    andi a4, a4, 255
 ; RV32VB-NEXT:    andi a5, a5, 255
-; RV32VB-NEXT:    slli a5, a5, 8
-; RV32VB-NEXT:    or a4, a4, a5
-; RV32VB-NEXT:    or a4, a4, a6
 ; RV32VB-NEXT:    slli a3, a3, 24
 ; RV32VB-NEXT:    andi a2, a2, 255
-; RV32VB-NEXT:    slli a2, a2, 16
-; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    andi a0, a0, 255
 ; RV32VB-NEXT:    andi a1, a1, 255
+; RV32VB-NEXT:    slli a6, a6, 16
+; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    slli a2, a2, 16
 ; RV32VB-NEXT:    slli a1, a1, 8
+; RV32VB-NEXT:    or a6, a7, a6
+; RV32VB-NEXT:    or a4, a4, a5
+; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    or a0, a0, a1
+; RV32VB-NEXT:    or a1, a4, a6
 ; RV32VB-NEXT:    or a0, a0, a2
 ; RV32VB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a0
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a4
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-NEXT:    ret
 ;
 ; RV32VB-PACK-LABEL: buildvec_v8i8_pack:
 ; RV32VB-PACK:       # %bb.0:
 ; RV32VB-PACK-NEXT:    packh a6, a6, a7
 ; RV32VB-PACK-NEXT:    packh a4, a4, a5
-; RV32VB-PACK-NEXT:    pack a4, a4, a6
 ; RV32VB-PACK-NEXT:    packh a2, a2, a3
 ; RV32VB-PACK-NEXT:    packh a0, a0, a1
+; RV32VB-PACK-NEXT:    pack a1, a4, a6
 ; RV32VB-PACK-NEXT:    pack a0, a0, a2
 ; RV32VB-PACK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a0
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a4
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-PACK-NEXT:    ret
 ;
 ; RV64V-ONLY-LABEL: buildvec_v8i8_pack:
 ; RV64V-ONLY:       # %bb.0:
 ; RV64V-ONLY-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a0
+; RV64V-ONLY-NEXT:    vmv.v.x v9, a4
+; RV64V-ONLY-NEXT:    vmv.v.i v0, 15
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a1
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, a3
-; RV64V-ONLY-NEXT:    vmv.v.x v8, a4
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a5
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a6
-; RV64V-ONLY-NEXT:    vmv.v.i v0, 15
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a6
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, a7
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_v8i8_pack:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    andi a4, a4, 255
-; RVA22U64-NEXT:    slli a4, a4, 32
+; RVA22U64-NEXT:    andi t0, a4, 255
 ; RVA22U64-NEXT:    andi a5, a5, 255
-; RVA22U64-NEXT:    slli a5, a5, 40
-; RVA22U64-NEXT:    or a4, a4, a5
 ; RVA22U64-NEXT:    slli a7, a7, 56
-; RVA22U64-NEXT:    andi a5, a6, 255
-; RVA22U64-NEXT:    slli a5, a5, 48
-; RVA22U64-NEXT:    or a5, a7, a5
-; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    andi a4, a6, 255
 ; RVA22U64-NEXT:    andi a2, a2, 255
-; RVA22U64-NEXT:    slli a2, a2, 16
 ; RVA22U64-NEXT:    andi a3, a3, 255
-; RVA22U64-NEXT:    slli a3, a3, 24
-; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    andi a0, a0, 255
 ; RVA22U64-NEXT:    andi a1, a1, 255
+; RVA22U64-NEXT:    slli t0, t0, 32
+; RVA22U64-NEXT:    slli a5, a5, 40
+; RVA22U64-NEXT:    slli a4, a4, 48
+; RVA22U64-NEXT:    slli a2, a2, 16
+; RVA22U64-NEXT:    slli a3, a3, 24
 ; RVA22U64-NEXT:    slli a1, a1, 8
+; RVA22U64-NEXT:    or a5, a5, t0
+; RVA22U64-NEXT:    or a4, a7, a4
+; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    or a0, a0, a1
+; RVA22U64-NEXT:    or a4, a4, a5
 ; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    or a0, a0, a4
 ; RVA22U64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -3138,11 +3105,11 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RVA22U64-PACK:       # %bb.0:
 ; RVA22U64-PACK-NEXT:    packh a6, a6, a7
 ; RVA22U64-PACK-NEXT:    packh a4, a4, a5
-; RVA22U64-PACK-NEXT:    packw a4, a4, a6
 ; RVA22U64-PACK-NEXT:    packh a2, a2, a3
 ; RVA22U64-PACK-NEXT:    packh a0, a0, a1
+; RVA22U64-PACK-NEXT:    packw a1, a4, a6
 ; RVA22U64-PACK-NEXT:    packw a0, a0, a2
-; RVA22U64-PACK-NEXT:    pack a0, a0, a4
+; RVA22U64-PACK-NEXT:    pack a0, a0, a1
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.s.x v8, a0
 ; RVA22U64-PACK-NEXT:    ret
@@ -3151,15 +3118,15 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RV64ZVE32:       # %bb.0:
 ; RV64ZVE32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a0
+; RV64ZVE32-NEXT:    vmv.v.x v9, a4
+; RV64ZVE32-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a1
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, a3
-; RV64ZVE32-NEXT:    vmv.v.x v8, a4
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a6
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, a7
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32-NEXT:    ret
   %v1 = insertelement <8 x i8> poison, i8 %e1, i32 0
   %v2 = insertelement <8 x i8> %v1, i8 %e2, i32 1
@@ -3189,32 +3156,32 @@ define <6 x i8> @buildvec_v6i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    slli a3, a3, 24
 ; RV32VB-NEXT:    andi a2, a2, 255
-; RV32VB-NEXT:    slli a2, a2, 16
-; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    andi a0, a0, 255
 ; RV32VB-NEXT:    andi a1, a1, 255
+; RV32VB-NEXT:    andi a4, a4, 255
+; RV32VB-NEXT:    andi a5, a5, 255
+; RV32VB-NEXT:    slli a2, a2, 16
 ; RV32VB-NEXT:    slli a1, a1, 8
+; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    or a0, a0, a1
 ; RV32VB-NEXT:    or a0, a0, a2
-; RV32VB-NEXT:    andi a1, a4, 255
-; RV32VB-NEXT:    andi a2, a5, 255
-; RV32VB-NEXT:    slli a2, a2, 8
-; RV32VB-NEXT:    or a1, a1, a2
+; RV32VB-NEXT:    or a4, a4, a5
 ; RV32VB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a0
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32VB-NEXT:    ret
 ;
 ; RV32VB-PACK-LABEL: buildvec_v6i8_pack:
 ; RV32VB-PACK:       # %bb.0:
 ; RV32VB-PACK-NEXT:    packh a2, a2, a3
 ; RV32VB-PACK-NEXT:    packh a0, a0, a1
-; RV32VB-PACK-NEXT:    pack a0, a0, a2
 ; RV32VB-PACK-NEXT:    packh a1, a4, a5
+; RV32VB-PACK-NEXT:    packh a3, a0, a0
+; RV32VB-PACK-NEXT:    pack a0, a0, a2
 ; RV32VB-PACK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a0
-; RV32VB-PACK-NEXT:    packh a0, a0, a0
-; RV32VB-PACK-NEXT:    pack a0, a1, a0
+; RV32VB-PACK-NEXT:    pack a0, a1, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
 ;
@@ -3233,21 +3200,21 @@ define <6 x i8> @buildvec_v6i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RVA22U64-LABEL: buildvec_v6i8_pack:
 ; RVA22U64:       # %bb.0:
 ; RVA22U64-NEXT:    andi a2, a2, 255
-; RVA22U64-NEXT:    slli a2, a2, 16
 ; RVA22U64-NEXT:    andi a3, a3, 255
-; RVA22U64-NEXT:    slli a3, a3, 24
-; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    andi a0, a0, 255
 ; RVA22U64-NEXT:    andi a1, a1, 255
+; RVA22U64-NEXT:    andi a4, a4, 255
+; RVA22U64-NEXT:    andi a5, a5, 255
+; RVA22U64-NEXT:    slli a2, a2, 16
+; RVA22U64-NEXT:    slli a3, a3, 24
 ; RVA22U64-NEXT:    slli a1, a1, 8
+; RVA22U64-NEXT:    slli a4, a4, 32
+; RVA22U64-NEXT:    slli a5, a5, 40
+; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    or a0, a0, a2
-; RVA22U64-NEXT:    andi a1, a4, 255
-; RVA22U64-NEXT:    slli a1, a1, 32
-; RVA22U64-NEXT:    andi a2, a5, 255
-; RVA22U64-NEXT:    slli a2, a2, 40
-; RVA22U64-NEXT:    or a1, a1, a2
-; RVA22U64-NEXT:    or a0, a0, a1
+; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    or a0, a0, a4
 ; RVA22U64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RVA22U64-NEXT:    vmv.s.x v8, a0
 ; RVA22U64-NEXT:    ret
@@ -3256,10 +3223,10 @@ define <6 x i8> @buildvec_v6i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RVA22U64-PACK:       # %bb.0:
 ; RVA22U64-PACK-NEXT:    packh a2, a2, a3
 ; RVA22U64-PACK-NEXT:    packh a0, a0, a1
-; RVA22U64-PACK-NEXT:    packw a0, a0, a2
 ; RVA22U64-PACK-NEXT:    packh a1, a4, a5
-; RVA22U64-PACK-NEXT:    packh a2, a0, a0
-; RVA22U64-PACK-NEXT:    packw a1, a1, a2
+; RVA22U64-PACK-NEXT:    packh a3, a0, a0
+; RVA22U64-PACK-NEXT:    packw a0, a0, a2
+; RVA22U64-PACK-NEXT:    packw a1, a1, a3
 ; RVA22U64-PACK-NEXT:    pack a0, a0, a1
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.s.x v8, a0
@@ -3299,9 +3266,9 @@ define <4 x i16> @buildvec_v4i16_pack(i16 %e1, i16 %e2, i16 %e3, i16 %e4) {
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    zext.h a2, a2
-; RV32VB-NEXT:    or a2, a2, a3
 ; RV32VB-NEXT:    slli a1, a1, 16
 ; RV32VB-NEXT:    zext.h a0, a0
+; RV32VB-NEXT:    or a2, a2, a3
 ; RV32VB-NEXT:    or a0, a0, a1
 ; RV32VB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a0
@@ -3330,11 +3297,11 @@ define <4 x i16> @buildvec_v4i16_pack(i16 %e1, i16 %e2, i16 %e3, i16 %e4) {
 ; RVA22U64:       # %bb.0:
 ; RVA22U64-NEXT:    slli a3, a3, 48
 ; RVA22U64-NEXT:    zext.h a2, a2
-; RVA22U64-NEXT:    slli a2, a2, 32
-; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    zext.h a0, a0
 ; RVA22U64-NEXT:    zext.h a1, a1
+; RVA22U64-NEXT:    slli a2, a2, 32
 ; RVA22U64-NEXT:    slli a1, a1, 16
+; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -3456,3 +3423,5 @@ define <4 x i1> @buildvec_i1_splat(i1 %e1) {
   ret <4 x i1> %v4
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index 6cab1bc2185287..a25014295f9e88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -63,8 +63,8 @@ define i8 @explode_8xi8(<8 x i8> %v) {
 ; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s a6, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, a6, a0
 ; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a0, a6, a0
 ; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, a0, a5
@@ -124,17 +124,17 @@ define i8 @explode_16xi8(<16 x i8> %v) {
 ; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s t6, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, t6, a0
 ; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, a5, a7
-; CHECK-NEXT:    add a5, a5, t0
-; CHECK-NEXT:    add a0, a0, a5
 ; CHECK-NEXT:    add t1, t1, t2
+; CHECK-NEXT:    add a0, t6, a0
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    add a5, a5, a7
 ; CHECK-NEXT:    add t1, t1, t3
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a5, a5, t0
 ; CHECK-NEXT:    add t1, t1, t4
+; CHECK-NEXT:    add a0, a0, a5
 ; CHECK-NEXT:    add t1, t1, t5
 ; CHECK-NEXT:    add a0, a0, t1
 ; CHECK-NEXT:    ret
@@ -233,8 +233,8 @@ define i16 @explode_8xi16(<8 x i16> %v) {
 ; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s a6, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, a6, a0
 ; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a0, a6, a0
 ; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, a0, a5
@@ -260,54 +260,54 @@ define i16 @explode_8xi16(<8 x i16> %v) {
 define i16 @explode_16xi16(<16 x i16> %v) {
 ; CHECK-LABEL: explode_16xi16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 8
 ; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vslidedown.vi v10, v8, 9
 ; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
+; CHECK-NEXT:    vslidedown.vi v10, v8, 10
 ; CHECK-NEXT:    vmv.x.s a2, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 5
+; CHECK-NEXT:    vslidedown.vi v10, v8, 11
 ; CHECK-NEXT:    vmv.x.s a3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
+; CHECK-NEXT:    vslidedown.vi v10, v8, 12
 ; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 7
+; CHECK-NEXT:    vslidedown.vi v10, v8, 13
 ; CHECK-NEXT:    vmv.x.s a5, v10
-; CHECK-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 8
+; CHECK-NEXT:    vslidedown.vi v10, v8, 14
 ; CHECK-NEXT:    vmv.x.s a6, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 9
+; CHECK-NEXT:    vslidedown.vi v10, v8, 15
 ; CHECK-NEXT:    vmv.x.s a7, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 10
-; CHECK-NEXT:    vmv.x.s t0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 11
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s t0, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 4
 ; CHECK-NEXT:    vmv.x.s t1, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 12
-; CHECK-NEXT:    vmv.x.s t2, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 13
+; CHECK-NEXT:    vslidedown.vi v10, v8, 5
+; CHECK-NEXT:    vmv.x.s t2, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 6
 ; CHECK-NEXT:    vmv.x.s t3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 14
-; CHECK-NEXT:    vmv.x.s t4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 15
-; CHECK-NEXT:    vmv.x.s t5, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 7
+; CHECK-NEXT:    vmv.x.s t4, v9
 ; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vmv.x.s t5, v10
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s t6, v8
+; CHECK-NEXT:    add t0, t0, t1
+; CHECK-NEXT:    add t2, t2, t3
+; CHECK-NEXT:    add a0, t5, a0
+; CHECK-NEXT:    add a3, a3, a4
+; CHECK-NEXT:    add t0, t6, t0
+; CHECK-NEXT:    add t2, t2, t4
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, t6, a0
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    add a3, a3, a5
+; CHECK-NEXT:    add t0, t0, t2
 ; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, a5, a7
-; CHECK-NEXT:    add a5, a5, t0
-; CHECK-NEXT:    add a0, a0, a5
-; CHECK-NEXT:    add t1, t1, t2
-; CHECK-NEXT:    add t1, t1, t3
-; CHECK-NEXT:    add t1, t1, t4
-; CHECK-NEXT:    add t1, t1, t5
-; CHECK-NEXT:    add a0, a0, t1
+; CHECK-NEXT:    add a3, a3, a6
+; CHECK-NEXT:    add a0, t0, a0
+; CHECK-NEXT:    add a3, a3, a7
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    ret
   %e0 = extractelement <16 x i16> %v, i32 0
   %e1 = extractelement <16 x i16> %v, i32 1
@@ -401,58 +401,58 @@ define i32 @explode_4xi32(<4 x i32> %v) {
 define i32 @explode_8xi32(<8 x i32> %v) {
 ; RV32-LABEL: explode_8xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a1, v10
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vmv.x.s a1, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vmv.x.s a2, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 7
-; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vmv.x.s a4, v9
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vmv.x.s a5, v10
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a6, v8
+; RV32-NEXT:    add a4, a4, a5
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a6, a0
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    add a4, a6, a4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add a0, a4, a0
+; RV32-NEXT:    add a0, a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_8xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vmv.x.s a0, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-NEXT:    vmv.x.s a4, v10
+; RV64-NEXT:    vmv.x.s a2, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vmv.x.s a4, v9
 ; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vmv.x.s a5, v10
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s a6, v8
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, a6, a0
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    add a4, a6, a4
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    addw a0, a0, a5
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    addw a0, a0, a3
 ; RV64-NEXT:    ret
   %e0 = extractelement <8 x i32> %v, i32 0
   %e1 = extractelement <8 x i32> %v, i32 1
@@ -484,24 +484,27 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV32-NEXT:    addi s0, sp, 128
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a1, v12
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vmv.x.s a0, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vmv.x.s a1, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vmv.x.s a2, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 7
+; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vslidedown.vi v13, v8, 3
+; RV32-NEXT:    mv a4, sp
 ; RV32-NEXT:    vmv.x.s a5, v12
-; RV32-NEXT:    mv a6, sp
+; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    vmv.x.s a6, v13
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (a6)
-; RV32-NEXT:    lw a6, 32(sp)
+; RV32-NEXT:    vse32.v v8, (a4)
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vredxor.vs v8, v8, v12
+; RV32-NEXT:    lw a4, 32(sp)
 ; RV32-NEXT:    lw a7, 36(sp)
 ; RV32-NEXT:    lw t0, 40(sp)
 ; RV32-NEXT:    lw t1, 44(sp)
@@ -509,22 +512,19 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV32-NEXT:    lw t3, 52(sp)
 ; RV32-NEXT:    lw t4, 56(sp)
 ; RV32-NEXT:    lw t5, 60(sp)
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s t6, v8
+; RV32-NEXT:    add a5, a5, a6
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, t6, a0
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    add a5, t6, a5
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add a0, a5, a0
+; RV32-NEXT:    add a3, a3, a4
 ; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a7, a7, t1
-; RV32-NEXT:    add a0, a0, a7
 ; RV32-NEXT:    add t2, t2, t3
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a7, a7, t1
 ; RV32-NEXT:    add t2, t2, t4
+; RV32-NEXT:    add a0, a0, a7
 ; RV32-NEXT:    add t2, t2, t5
 ; RV32-NEXT:    add a0, a0, t2
 ; RV32-NEXT:    addi sp, s0, -128
@@ -548,24 +548,27 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV64-NEXT:    addi s0, sp, 128
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v12
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 4
-; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vmv.x.s a0, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vmv.x.s a1, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 6
-; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vmv.x.s a2, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 7
+; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vslidedown.vi v13, v8, 3
+; RV64-NEXT:    mv a4, sp
 ; RV64-NEXT:    vmv.x.s a5, v12
-; RV64-NEXT:    mv a6, sp
+; RV64-NEXT:    vmv.s.x v12, zero
+; RV64-NEXT:    vmv.x.s a6, v13
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v8, (a6)
-; RV64-NEXT:    lw a6, 32(sp)
+; RV64-NEXT:    vse32.v v8, (a4)
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v12
+; RV64-NEXT:    lw a4, 32(sp)
 ; RV64-NEXT:    lw a7, 36(sp)
 ; RV64-NEXT:    lw t0, 40(sp)
 ; RV64-NEXT:    lw t1, 44(sp)
@@ -573,22 +576,19 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV64-NEXT:    lw t3, 52(sp)
 ; RV64-NEXT:    lw t4, 56(sp)
 ; RV64-NEXT:    lw t5, 60(sp)
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s t6, v8
+; RV64-NEXT:    add a5, a5, a6
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, t6, a0
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    add a5, t6, a5
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a0, a0, a5
+; RV64-NEXT:    add a0, a5, a0
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a7, a7, t1
-; RV64-NEXT:    add a0, a0, a7
 ; RV64-NEXT:    add t2, t2, t3
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a7, a7, t1
 ; RV64-NEXT:    add t2, t2, t4
+; RV64-NEXT:    add a0, a0, a7
 ; RV64-NEXT:    add t2, t2, t5
 ; RV64-NEXT:    addw a0, a0, t2
 ; RV64-NEXT:    addi sp, s0, -128
@@ -639,9 +639,9 @@ define i64 @explode_2xi64(<2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -666,28 +666,29 @@ define i64 @explode_4xi64(<4 x i64> %v) {
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v12, v10, a0
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vsrl.vx v12, v10, a0
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vmv.s.x v12, zero
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vredxor.vs v12, v8, v12
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vsrl.vx v10, v10, a0
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vmv.x.s a4, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vsrl.vx v8, v12, a0
+; RV32-NEXT:    vmv.x.s a0, v12
 ; RV32-NEXT:    vmv.x.s a5, v8
-; RV32-NEXT:    add a2, a5, a2
-; RV32-NEXT:    sltu a5, a2, a5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, a5
-; RV32-NEXT:    add a1, a0, a3
-; RV32-NEXT:    add a0, a2, a4
-; RV32-NEXT:    sltu a2, a0, a2
-; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    sltu a6, a1, a0
+; RV32-NEXT:    add a3, a5, a3
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    add a3, a3, a6
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    sltu a1, a0, a1
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_4xi64:
@@ -721,59 +722,60 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s a1, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 3
 ; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s a3, v16
-; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vsrl.vx v12, v12, a0
+; RV32-NEXT:    vmv.x.s a1, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s a5, v16
+; RV32-NEXT:    vmv.x.s a4, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
+; RV32-NEXT:    vmv.x.s a3, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    vsrl.vx v12, v12, a0
 ; RV32-NEXT:    vmv.x.s a6, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s a7, v16
-; RV32-NEXT:    vmv.x.s t0, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s t1, v16
-; RV32-NEXT:    vmv.x.s t2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 7
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s t3, v16
-; RV32-NEXT:    vmv.x.s t4, v12
-; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vmv.x.s a7, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
+; RV32-NEXT:    vmv.x.s t0, v16
+; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vredxor.vs v16, v8, v16
+; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    vmv.x.s t1, v12
+; RV32-NEXT:    vsrl.vx v12, v12, a0
+; RV32-NEXT:    vmv.x.s t2, v8
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s t3, v12
+; RV32-NEXT:    vmv.x.s t4, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vsrl.vx v8, v16, a0
+; RV32-NEXT:    vmv.x.s a0, v16
 ; RV32-NEXT:    vmv.x.s t5, v8
-; RV32-NEXT:    add a2, t5, a2
-; RV32-NEXT:    sltu t5, a2, t5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, t5
-; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    add a1, t5, a1
 ; RV32-NEXT:    add a4, a2, a4
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    sltu a1, a4, a2
-; RV32-NEXT:    add a1, a1, a5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a6, a4, a6
-; RV32-NEXT:    sltu a1, a6, a4
-; RV32-NEXT:    add a1, a1, a7
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t0, a6, t0
-; RV32-NEXT:    sltu a1, t0, a6
-; RV32-NEXT:    add a1, a1, t1
+; RV32-NEXT:    add a5, a4, a5
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a1, a1, a6
+; RV32-NEXT:    sltu a2, a5, a4
+; RV32-NEXT:    add a7, a5, a7
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t2, t0, t2
-; RV32-NEXT:    sltu a1, t2, t0
+; RV32-NEXT:    add a2, a2, t0
+; RV32-NEXT:    sltu a1, a7, a5
+; RV32-NEXT:    add t1, a7, t1
+; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    add a1, a1, t3
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    add a0, t2, t4
-; RV32-NEXT:    sltu a2, a0, t2
+; RV32-NEXT:    sltu a3, t1, a7
+; RV32-NEXT:    add a0, t1, t2
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a3, a3, t4
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    sltu a2, a0, t1
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    ret
 ;
@@ -792,21 +794,21 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
 ; RV64-NEXT:    vmv.x.s a0, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    mv a1, sp
+; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vmv.s.x v12, zero
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    ld a2, 32(sp)
+; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v12
+; RV64-NEXT:    ld a1, 32(sp)
 ; RV64-NEXT:    ld a3, 40(sp)
 ; RV64-NEXT:    ld a4, 48(sp)
 ; RV64-NEXT:    ld a5, 56(sp)
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s a6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, a6, a0
 ; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a0, a6, a0
+; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    add a0, a0, a3
 ; RV64-NEXT:    add a0, a0, a5
@@ -840,20 +842,20 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-LABEL: explode_16xi64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 0(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -64
+; RV32-NEXT:    .cfi_def_cfa_offset 64
+; RV32-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s10, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s11, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    .cfi_offset s2, -12
@@ -866,121 +868,129 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    .cfi_offset s9, -40
 ; RV32-NEXT:    .cfi_offset s10, -44
 ; RV32-NEXT:    .cfi_offset s11, -48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 8 * vlenb
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v16, v8, 2
+; RV32-NEXT:    vslidedown.vi v24, v8, 2
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s a1, v24
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 3
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s a3, v24
-; RV32-NEXT:    vmv.x.s a4, v16
+; RV32-NEXT:    vslidedown.vi v0, v8, 3
 ; RV32-NEXT:    vslidedown.vi v16, v8, 4
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s a5, v24
+; RV32-NEXT:    vmv.x.s a1, v24
+; RV32-NEXT:    vsrl.vx v24, v24, a0
+; RV32-NEXT:    vmv.x.s a2, v24
+; RV32-NEXT:    vslidedown.vi v24, v8, 5
+; RV32-NEXT:    vmv.x.s a3, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s a4, v0
+; RV32-NEXT:    vslidedown.vi v0, v8, 6
+; RV32-NEXT:    vmv.x.s a5, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    vmv.x.s a6, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 5
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s a7, v24
-; RV32-NEXT:    vmv.x.s t0, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 6
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s t1, v24
-; RV32-NEXT:    vmv.x.s t2, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 7
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s t3, v24
+; RV32-NEXT:    vmv.x.s a7, v24
+; RV32-NEXT:    vsrl.vx v24, v24, a0
+; RV32-NEXT:    vmv.x.s t0, v24
+; RV32-NEXT:    vslidedown.vi v24, v8, 8
+; RV32-NEXT:    vmv.x.s t1, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s t2, v0
+; RV32-NEXT:    vslidedown.vi v0, v8, 9
+; RV32-NEXT:    vmv.x.s t3, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    vmv.x.s t4, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 8
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s t5, v24
-; RV32-NEXT:    vmv.x.s t6, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 9
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s0, v24
-; RV32-NEXT:    vmv.x.s s1, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 10
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s2, v24
+; RV32-NEXT:    vmv.x.s t5, v24
+; RV32-NEXT:    vsrl.vx v24, v24, a0
+; RV32-NEXT:    vmv.x.s t6, v24
+; RV32-NEXT:    vslidedown.vi v24, v8, 11
+; RV32-NEXT:    vmv.x.s s0, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s s1, v0
+; RV32-NEXT:    vslidedown.vi v0, v8, 12
+; RV32-NEXT:    vmv.x.s s2, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    vmv.x.s s3, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 11
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s4, v24
-; RV32-NEXT:    vmv.x.s s5, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 12
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s6, v24
-; RV32-NEXT:    vmv.x.s s7, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 13
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s9, v24
-; RV32-NEXT:    vmv.x.s s8, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 14
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.s.x v17, zero
+; RV32-NEXT:    addi s4, sp, 16
+; RV32-NEXT:    vs8r.v v16, (s4) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.x.s s4, v24
+; RV32-NEXT:    vsrl.vx v24, v24, a0
+; RV32-NEXT:    vmv.x.s s5, v24
+; RV32-NEXT:    vslidedown.vi v24, v8, 14
+; RV32-NEXT:    vmv.x.s s6, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s s7, v0
+; RV32-NEXT:    vmv.s.x v7, zero
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v17, v8, v17
+; RV32-NEXT:    vredxor.vs v16, v8, v7
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 15
+; RV32-NEXT:    addi s8, sp, 16
+; RV32-NEXT:    vl8r.v v0, (s8) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv.x.s s8, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s s9, v0
+; RV32-NEXT:    vsrl.vx v0, v24, a0
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v18, v17, a0
-; RV32-NEXT:    vmv.x.s s10, v18
+; RV32-NEXT:    vsrl.vx v17, v16, a0
+; RV32-NEXT:    vmv.x.s s10, v16
 ; RV32-NEXT:    vmv.x.s s11, v17
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v8, a0
-; RV32-NEXT:    add a1, s10, a1
+; RV32-NEXT:    vsrl.vx v16, v8, a0
 ; RV32-NEXT:    add a2, s11, a2
-; RV32-NEXT:    sltu a0, a2, s11
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    add a0, a0, a3
-; RV32-NEXT:    add a4, a2, a4
-; RV32-NEXT:    sltu a1, a4, a2
-; RV32-NEXT:    add a1, a1, a5
+; RV32-NEXT:    add a1, s10, a1
+; RV32-NEXT:    sltu a0, a1, s10
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a3, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
+; RV32-NEXT:    add a1, a1, a6
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a6, a4, a6
-; RV32-NEXT:    sltu a1, a6, a4
-; RV32-NEXT:    add a1, a1, a7
+; RV32-NEXT:    add a5, a3, a5
+; RV32-NEXT:    sltu a1, a5, a3
+; RV32-NEXT:    add a1, a1, t0
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t0, a6, t0
-; RV32-NEXT:    sltu a1, t0, a6
-; RV32-NEXT:    add a1, a1, t1
+; RV32-NEXT:    add a7, a5, a7
+; RV32-NEXT:    sltu a1, a7, a5
+; RV32-NEXT:    add a1, a1, t2
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t2, t0, t2
-; RV32-NEXT:    sltu a1, t2, t0
-; RV32-NEXT:    add a1, a1, t3
+; RV32-NEXT:    add t1, a7, t1
+; RV32-NEXT:    sltu a1, t1, a7
+; RV32-NEXT:    add a1, a1, t4
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t4, t2, t4
-; RV32-NEXT:    sltu a1, t4, t2
-; RV32-NEXT:    add a1, a1, t5
+; RV32-NEXT:    add t3, t1, t3
+; RV32-NEXT:    sltu a1, t3, t1
+; RV32-NEXT:    add a1, a1, t6
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t6, t4, t6
-; RV32-NEXT:    sltu a1, t6, t4
-; RV32-NEXT:    add a1, a1, s0
+; RV32-NEXT:    add t5, t3, t5
+; RV32-NEXT:    sltu a1, t5, t3
+; RV32-NEXT:    add a1, a1, s1
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s1, t6, s1
-; RV32-NEXT:    sltu a1, s1, t6
-; RV32-NEXT:    add a1, a1, s2
+; RV32-NEXT:    add s0, t5, s0
+; RV32-NEXT:    sltu a1, s0, t5
+; RV32-NEXT:    add a1, a1, s3
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s3, s1, s3
-; RV32-NEXT:    sltu a1, s3, s1
-; RV32-NEXT:    add a1, a1, s4
+; RV32-NEXT:    add s2, s0, s2
+; RV32-NEXT:    sltu a1, s2, s0
+; RV32-NEXT:    add a1, a1, s5
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s5, s3, s5
-; RV32-NEXT:    sltu a1, s5, s3
-; RV32-NEXT:    add a1, a1, s6
+; RV32-NEXT:    add s4, s2, s4
+; RV32-NEXT:    sltu a1, s4, s2
+; RV32-NEXT:    add a1, a1, s7
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s7, s5, s7
-; RV32-NEXT:    sltu a1, s7, s5
+; RV32-NEXT:    add s6, s4, s6
+; RV32-NEXT:    sltu a1, s6, s4
 ; RV32-NEXT:    add a1, a1, s9
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vmv.x.s a1, v24
-; RV32-NEXT:    add s8, s7, s8
-; RV32-NEXT:    sltu a2, s8, s7
+; RV32-NEXT:    vmv.x.s a1, v0
+; RV32-NEXT:    add s8, s6, s8
+; RV32-NEXT:    sltu a2, s8, s6
 ; RV32-NEXT:    add a1, a2, a1
-; RV32-NEXT:    vmv.x.s a2, v16
+; RV32-NEXT:    vmv.x.s a2, v24
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vmv.x.s a1, v0
+; RV32-NEXT:    vmv.x.s a1, v16
 ; RV32-NEXT:    add a2, s8, a2
 ; RV32-NEXT:    sltu a3, a2, s8
 ; RV32-NEXT:    add a1, a3, a1
@@ -989,18 +999,22 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    add a0, a2, a0
 ; RV32-NEXT:    sltu a2, a0, a2
 ; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 0(sp) # 4-byte Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add sp, sp, a2
+; RV32-NEXT:    .cfi_def_cfa sp, 64
+; RV32-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 16(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
 ; RV32-NEXT:    .cfi_restore s1
 ; RV32-NEXT:    .cfi_restore s2
@@ -1013,7 +1027,7 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    .cfi_restore s9
 ; RV32-NEXT:    .cfi_restore s10
 ; RV32-NEXT:    .cfi_restore s11
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 64
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1032,11 +1046,14 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV64-NEXT:    vslidedown.vi v16, v8, 2
 ; RV64-NEXT:    vmv.x.s a0, v16
 ; RV64-NEXT:    vslidedown.vi v16, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v16
-; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    mv a1, sp
+; RV64-NEXT:    vmv.x.s a2, v16
+; RV64-NEXT:    vmv.s.x v16, zero
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    ld a2, 32(sp)
+; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v16
+; RV64-NEXT:    ld a1, 32(sp)
 ; RV64-NEXT:    ld a3, 40(sp)
 ; RV64-NEXT:    ld a4, 48(sp)
 ; RV64-NEXT:    ld a5, 56(sp)
@@ -1048,20 +1065,17 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV64-NEXT:    ld t3, 104(sp)
 ; RV64-NEXT:    ld t4, 112(sp)
 ; RV64-NEXT:    ld t5, 120(sp)
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s t6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, t6, a0
 ; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a0, t6, a0
+; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a0, a0, a3
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, a5, a7
-; RV64-NEXT:    add a0, a0, a5
 ; RV64-NEXT:    add t0, t0, t1
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a5, a5, a7
 ; RV64-NEXT:    add t0, t0, t2
+; RV64-NEXT:    add a0, a0, a5
 ; RV64-NEXT:    add t0, t0, t3
 ; RV64-NEXT:    add a0, a0, t0
 ; RV64-NEXT:    add t4, t4, t5
@@ -1116,22 +1130,22 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
 ; RV32-NEXT:    vmv.x.s a0, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vmv.x.s a2, v9
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vmv.x.s a2, v12
 ; RV32-NEXT:    vslidedown.vi v12, v9, 1
 ; RV32-NEXT:    vmv.x.s a3, v12
 ; RV32-NEXT:    vslidedown.vi v12, v9, 2
 ; RV32-NEXT:    vmv.x.s a4, v12
 ; RV32-NEXT:    vslidedown.vi v9, v9, 3
-; RV32-NEXT:    vmv.x.s a5, v9
-; RV32-NEXT:    vmv.x.s a6, v10
+; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vmv.x.s a6, v9
 ; RV32-NEXT:    vslidedown.vi v9, v10, 1
 ; RV32-NEXT:    vmv.x.s a7, v9
 ; RV32-NEXT:    vslidedown.vi v9, v10, 2
 ; RV32-NEXT:    vmv.x.s t0, v9
 ; RV32-NEXT:    vslidedown.vi v9, v10, 3
-; RV32-NEXT:    vmv.x.s t1, v9
-; RV32-NEXT:    vmv.x.s t2, v11
+; RV32-NEXT:    vmv.x.s t1, v11
+; RV32-NEXT:    vmv.x.s t2, v9
 ; RV32-NEXT:    vslidedown.vi v9, v11, 1
 ; RV32-NEXT:    vmv.x.s t3, v9
 ; RV32-NEXT:    vslidedown.vi v9, v11, 2
@@ -1142,18 +1156,18 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s t6, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, t6, a0
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a2, a2, a4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    add t1, t2, t1
+; RV32-NEXT:    add a0, t6, a0
+; RV32-NEXT:    add a1, a1, a4
 ; RV32-NEXT:    add a5, a5, a7
-; RV32-NEXT:    add a5, a5, t0
-; RV32-NEXT:    add a0, a0, a5
-; RV32-NEXT:    add t1, t1, t2
 ; RV32-NEXT:    add t1, t1, t3
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a5, a5, t0
 ; RV32-NEXT:    add t1, t1, t4
+; RV32-NEXT:    add a0, a0, a5
 ; RV32-NEXT:    add t1, t1, t5
 ; RV32-NEXT:    add a0, a0, t1
 ; RV32-NEXT:    ret
@@ -1164,22 +1178,22 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
 ; RV64-NEXT:    vmv.x.s a0, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    vmv.x.s a2, v9
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vmv.x.s a2, v12
 ; RV64-NEXT:    vslidedown.vi v12, v9, 1
 ; RV64-NEXT:    vmv.x.s a3, v12
 ; RV64-NEXT:    vslidedown.vi v12, v9, 2
 ; RV64-NEXT:    vmv.x.s a4, v12
 ; RV64-NEXT:    vslidedown.vi v9, v9, 3
-; RV64-NEXT:    vmv.x.s a5, v9
-; RV64-NEXT:    vmv.x.s a6, v10
+; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vmv.x.s a6, v9
 ; RV64-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64-NEXT:    vmv.x.s a7, v9
 ; RV64-NEXT:    vslidedown.vi v9, v10, 2
 ; RV64-NEXT:    vmv.x.s t0, v9
 ; RV64-NEXT:    vslidedown.vi v9, v10, 3
-; RV64-NEXT:    vmv.x.s t1, v9
-; RV64-NEXT:    vmv.x.s t2, v11
+; RV64-NEXT:    vmv.x.s t1, v11
+; RV64-NEXT:    vmv.x.s t2, v9
 ; RV64-NEXT:    vslidedown.vi v9, v11, 1
 ; RV64-NEXT:    vmv.x.s t3, v9
 ; RV64-NEXT:    vslidedown.vi v9, v11, 2
@@ -1190,18 +1204,18 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s t6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, t6, a0
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a2, a2, a4
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    add a5, a6, a5
+; RV64-NEXT:    add t1, t2, t1
+; RV64-NEXT:    add a0, t6, a0
+; RV64-NEXT:    add a1, a1, a4
 ; RV64-NEXT:    add a5, a5, a7
-; RV64-NEXT:    add a5, a5, t0
-; RV64-NEXT:    add a0, a0, a5
-; RV64-NEXT:    add t1, t1, t2
 ; RV64-NEXT:    add t1, t1, t3
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a5, a5, t0
 ; RV64-NEXT:    add t1, t1, t4
+; RV64-NEXT:    add a0, a0, a5
 ; RV64-NEXT:    add t1, t1, t5
 ; RV64-NEXT:    addw a0, a0, t1
 ; RV64-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index c65e7aec712ae4..544602951e10d4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -53,13 +53,11 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; V128:       # %bb.0:
 ; V128-NEXT:    vmv1r.v v12, v9
 ; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; V128-NEXT:    vid.v v9
-; V128-NEXT:    vsrl.vi v14, v9, 1
-; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; V128-NEXT:    vrgatherei16.vv v10, v8, v14
-; V128-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; V128-NEXT:    vid.v v10
 ; V128-NEXT:    vmv.v.i v0, 10
-; V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; V128-NEXT:    vsrl.vi v14, v10, 1
+; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; V128-NEXT:    vrgatherei16.vv v10, v8, v14
 ; V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
 ; V128-NEXT:    vmv.v.v v8, v10
 ; V128-NEXT:    ret
@@ -191,10 +189,12 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; V128-NEXT:    vwaddu.vv v10, v8, v8
 ; V128-NEXT:    li a0, -1
+; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; V128-NEXT:    vid.v v11
+; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; V128-NEXT:    vwmaccu.vx v10, a0, v8
 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; V128-NEXT:    vid.v v8
-; V128-NEXT:    vsrl.vi v8, v8, 1
+; V128-NEXT:    vsrl.vi v8, v11, 1
 ; V128-NEXT:    vmv.v.i v0, 10
 ; V128-NEXT:    vadd.vi v8, v8, 1
 ; V128-NEXT:    vrgather.vv v10, v9, v8, v0.t
@@ -206,10 +206,12 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; V512-NEXT:    vwaddu.vv v10, v8, v8
 ; V512-NEXT:    li a0, -1
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vid.v v11
+; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; V512-NEXT:    vwmaccu.vx v10, a0, v8
 ; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
-; V512-NEXT:    vid.v v8
-; V512-NEXT:    vsrl.vi v8, v8, 1
+; V512-NEXT:    vsrl.vi v8, v11, 1
 ; V512-NEXT:    vmv.v.i v0, 10
 ; V512-NEXT:    vadd.vi v8, v8, 1
 ; V512-NEXT:    vrgather.vv v10, v9, v8, v0.t
@@ -409,26 +411,27 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; V128-NEXT:    slli a0, a0, 3
 ; V128-NEXT:    sub sp, sp, a0
 ; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT:    vmv8r.v v0, v16
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vmv8r.v v24, v16
 ; V128-NEXT:    vmv8r.v v16, v8
+; V128-NEXT:    vmv8r.v v8, v24
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v8, v0, 16
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v0, v8
+; V128-NEXT:    vslidedown.vi v0, v24, 16
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v24, a0, v8
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v24, v8, v0
+; V128-NEXT:    vwmaccu.vx v24, a0, v0
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; V128-NEXT:    vslidedown.vi v0, v16, 16
+; V128-NEXT:    lui a1, 699051
+; V128-NEXT:    li a2, 32
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v8, v0, v16
-; V128-NEXT:    vwmaccu.vx v8, a0, v16
-; V128-NEXT:    lui a1, 699051
 ; V128-NEXT:    addi a1, a1, -1366
 ; V128-NEXT:    vmv.s.x v0, a1
-; V128-NEXT:    li a1, 32
-; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; V128-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; V128-NEXT:    addi a1, sp, 16
 ; V128-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 2f73e3c7a2be6c..e46587f58b4eb6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -100,8 +100,8 @@ define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    vrsub.vi v10, v9, 4
 ; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vrsub.vi v10, v9, 4
 ; CHECK-NEXT:    vmv.v.i v9, 5
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -116,8 +116,8 @@ define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 3
-; CHECK-NEXT:    vmul.vx v10, v9, a0
 ; CHECK-NEXT:    vmv.v.i v0, 3
+; CHECK-NEXT:    vmul.vx v10, v9, a0
 ; CHECK-NEXT:    vmv.v.i v9, 5
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -162,9 +162,9 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    vmv.v.i v21, 2
+; RV32-NEXT:    li a0, 164
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vrgatherei16.vv v16, v8, v20
-; RV32-NEXT:    li a0, 164
 ; RV32-NEXT:    vmv.s.x v0, a0
 ; RV32-NEXT:    li a0, 5
 ; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -176,21 +176,21 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 ;
 ; RV64-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    li a0, 164
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    lui a0, 327683
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    addi a0, a0, 1
 ; RV64-NEXT:    slli a0, a0, 17
 ; RV64-NEXT:    addi a0, a0, 1
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v20, a0
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v20
-; RV64-NEXT:    li a0, 164
-; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    lui a0, 163841
 ; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    addi a0, a0, 1
 ; RV64-NEXT:    slli a0, a0, 17
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vrgatherei16.vv v16, v8, v20
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
@@ -207,15 +207,15 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) {
 ; RV32-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI12_0)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV32-NEXT:    vle16.v v16, (a0)
-; RV32-NEXT:    vmv.v.i v20, -1
+; RV32-NEXT:    vmv.v.i v16, -1
+; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    lui a0, %hi(.LCPI12_1)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI12_1)
-; RV32-NEXT:    vle16.v v17, (a0)
+; RV32-NEXT:    vle16.v v21, (a0)
 ; RV32-NEXT:    li a0, 113
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vrgatherei16.vv v12, v20, v16
-; RV32-NEXT:    vrgatherei16.vv v12, v8, v17, v0.t
+; RV32-NEXT:    vrgatherei16.vv v12, v16, v20
+; RV32-NEXT:    vrgatherei16.vv v12, v8, v21, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    ret
 ;
@@ -243,14 +243,14 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) {
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI13_0)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV32-NEXT:    vle16.v v16, (a0)
-; RV32-NEXT:    vrgatherei16.vv v12, v8, v16
 ; RV32-NEXT:    lui a0, %hi(.LCPI13_1)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI13_1)
-; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    vle16.v v17, (a0)
 ; RV32-NEXT:    li a0, 140
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vmv.v.i v16, 5
-; RV32-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
+; RV32-NEXT:    vrgatherei16.vv v12, v8, v16
+; RV32-NEXT:    vmv.v.i v8, 5
+; RV32-NEXT:    vrgatherei16.vv v12, v8, v17, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    ret
 ;
@@ -437,9 +437,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 4
 ; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    li a0, 70
 ; CHECK-NEXT:    vsetivli zero, 3, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v11, v10, 2
-; CHECK-NEXT:    li a0, 70
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v10, v8, 2
@@ -456,13 +456,13 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 6
 ; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v11, v10, 5
 ; CHECK-NEXT:    lui a0, 8256
 ; CHECK-NEXT:    addi a0, a0, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    li a0, 98
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v11, v10, 5
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
@@ -724,17 +724,18 @@ define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) {
 ; CHECK-LABEL: shuffle_v64i8_v8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    li a1, 240
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    lui a1, 98561
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    vsll.vi v14, v12, 3
 ; CHECK-NEXT:    vrgather.vv v12, v8, v14
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    li a1, 240
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    lui a1, 98561
 ; CHECK-NEXT:    addi a1, a1, -2048
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v12, v8, v10, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index f894691b993e47..cba8de82ec41b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1045,47 +1045,47 @@ define void @urem_v2i64(ptr %x, ptr %y) {
 define void @mulhu_v16i8(ptr %x) {
 ; CHECK-LABEL: mulhu_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    lui a1, 3
-; CHECK-NEXT:    addi a1, a1, -2044
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a2, a1, 32
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a2
 ; CHECK-NEXT:    lui a2, %hi(.LCPI65_0)
 ; CHECK-NEXT:    addi a2, a2, %lo(.LCPI65_0)
 ; CHECK-NEXT:    vle8.v v11, (a2)
-; CHECK-NEXT:    li a2, -128
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    addi a1, a1, -2044
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    addi a1, a2, 32
+; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    li a1, -128
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vxm v12, v10, a2, v0
+; CHECK-NEXT:    vmerge.vxm v12, v10, a1, v0
+; CHECK-NEXT:    li a1, 513
+; CHECK-NEXT:    vmv.v.i v13, 4
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vsrl.vv v8, v9, v8
-; CHECK-NEXT:    vmulhu.vv v8, v8, v11
-; CHECK-NEXT:    vsub.vv v9, v9, v8
-; CHECK-NEXT:    vmulhu.vv v9, v9, v12
-; CHECK-NEXT:    vadd.vv v9, v9, v8
-; CHECK-NEXT:    li a2, 513
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a2
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    addi a1, a2, 78
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 4
-; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
-; CHECK-NEXT:    addi a1, a1, 78
+; CHECK-NEXT:    vmerge.vim v10, v13, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsrl.vv v8, v9, v8
+; CHECK-NEXT:    vmulhu.vv v8, v8, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 3, v0
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    addi a1, a1, 304
-; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vsub.vv v9, v9, v8
+; CHECK-NEXT:    vmulhu.vv v9, v9, v12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v10, v10, 3, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 2, v0
-; CHECK-NEXT:    vsrl.vv v8, v9, v8
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vmerge.vim v9, v10, 2, v0
+; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <16 x i8>, ptr %x
@@ -1100,31 +1100,31 @@ define void @mulhu_v8i16(ptr %x) {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v10, a1
+; CHECK-NEXT:    vmv.v.i v10, 1
+; CHECK-NEXT:    li a1, 33
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, %hi(.LCPI66_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI66_0)
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v11, (a1)
-; CHECK-NEXT:    vmv.v.i v12, 1
+; CHECK-NEXT:    vmv.v.i v11, 3
+; CHECK-NEXT:    vle16.v v12, (a1)
+; CHECK-NEXT:    vmerge.vim v11, v11, 2, v0
+; CHECK-NEXT:    vmv.v.i v13, 0
 ; CHECK-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v9, v12, 6
+; CHECK-NEXT:    vslideup.vi v9, v10, 6
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vv v9, v8, v9
-; CHECK-NEXT:    vmulhu.vv v9, v9, v11
+; CHECK-NEXT:    vmulhu.vv v9, v9, v12
+; CHECK-NEXT:    lui a1, 1048568
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
+; CHECK-NEXT:    vmv.s.x v13, a1
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v13
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    li a1, 33
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vmv.v.i v9, 3
-; CHECK-NEXT:    vmerge.vim v9, v9, 2, v0
 ; CHECK-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v9, v12, 6
+; CHECK-NEXT:    vslideup.vi v11, v10, 6
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v11
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x i16>, ptr %x
@@ -1157,22 +1157,22 @@ define void @mulhu_v4i32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    lui a1, %hi(.LCPI68_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI68_0)
-; CHECK-NEXT:    vle32.v v9, (a1)
 ; CHECK-NEXT:    lui a1, 524288
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv.s.x v10, a1
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    lui a1, %hi(.LCPI68_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI68_0)
+; CHECK-NEXT:    vle32.v v11, (a1)
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v11, v10, 2
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v9, v8, v9
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vmulhu.vv v8, v8, v11
-; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vslideup.vi v9, v10, 2
 ; CHECK-NEXT:    lui a1, 4128
 ; CHECK-NEXT:    addi a1, a1, 514
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmulhu.vv v10, v8, v11
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    vadd.vv v8, v8, v10
 ; CHECK-NEXT:    vsext.vf4 v10, v9
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -1192,10 +1192,10 @@ define void @mulhu_v2i64(ptr %x) {
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI69_0)
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vle32.v v9, (a1)
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vmulhu.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 32
 ; RV32-NEXT:    addi a1, a1, 1
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vmulhu.vv v8, v8, v9
 ; RV32-NEXT:    vmv.s.x v9, a1
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vsext.vf4 v10, v9
@@ -1209,16 +1209,16 @@ define void @mulhu_v2i64(ptr %x) {
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 838861
+; RV64-NEXT:    lui a2, 699051
 ; RV64-NEXT:    addiw a1, a1, -819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    addiw a2, a2, -1365
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    vmv.v.x v9, a1
-; RV64-NEXT:    lui a1, 699051
-; RV64-NEXT:    addiw a1, a1, -1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
-; RV64-NEXT:    vmv.s.x v9, a1
+; RV64-NEXT:    vmv.s.x v9, a2
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vmulhu.vv v8, v8, v9
 ; RV64-NEXT:    vid.v v9
@@ -1246,9 +1246,9 @@ define void @mulhs_v16i8(ptr %x) {
 ; CHECK-NEXT:    li a1, 57
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vxm v9, v9, a1, v0
+; CHECK-NEXT:    vmv.v.i v10, 7
 ; CHECK-NEXT:    vmulhu.vv v8, v8, v9
-; CHECK-NEXT:    vmv.v.i v9, 7
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -1263,11 +1263,11 @@ define void @mulhs_v8i16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    li a1, 105
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 5
 ; CHECK-NEXT:    addi a1, a1, -1755
 ; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    li a1, 105
-; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 1048571
 ; CHECK-NEXT:    addi a1, a1, 1755
 ; CHECK-NEXT:    vmerge.vxm v9, v9, a1, v0
@@ -1309,9 +1309,9 @@ define void @mulhs_v4i32(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    lui a1, 419430
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    addi a1, a1, 1639
 ; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    lui a1, 629146
 ; RV32-NEXT:    addi a1, a1, -1639
 ; RV32-NEXT:    vmerge.vxm v9, v9, a1, v0
@@ -1349,28 +1349,27 @@ define void @mulhs_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a2, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a2
+; RV32-NEXT:    vid.v v9
+; RV32-NEXT:    addi a2, a1, 1365
+; RV32-NEXT:    vmv.v.x v10, a2
+; RV32-NEXT:    li a2, 63
 ; RV32-NEXT:    addi a1, a1, 1366
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
-; RV32-NEXT:    vmv.s.x v9, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vmulh.vv v9, v8, v9
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vid.v v10
-; RV32-NEXT:    vsrl.vi v10, v10, 1
-; RV32-NEXT:    vrsub.vi v10, v10, 0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vmadd.vv v10, v8, v9
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsrl.vx v8, v10, a1
+; RV32-NEXT:    vmv.s.x v10, a1
 ; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    vmv.s.x v9, a1
+; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v9, 1
+; RV32-NEXT:    vrsub.vi v9, v9, 0
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vmulh.vv v10, v8, v10
+; RV32-NEXT:    vmadd.vv v9, v8, v10
+; RV32-NEXT:    vmv.s.x v8, a1
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vsext.vf4 v11, v9
+; RV32-NEXT:    vsext.vf4 v10, v8
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsra.vv v9, v10, v11
+; RV32-NEXT:    vsrl.vx v8, v9, a2
+; RV32-NEXT:    vsra.vv v9, v9, v10
 ; RV32-NEXT:    vadd.vv v8, v9, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
@@ -1381,21 +1380,21 @@ define void @mulhs_v2i64(ptr %x) {
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 349525
 ; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    lui a2, %hi(.LCPI74_0)
+; RV64-NEXT:    vid.v v9
 ; RV64-NEXT:    ld a2, %lo(.LCPI74_0)(a2)
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a1, a1, a3
-; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    li a1, 63
+; RV64-NEXT:    vrsub.vi v11, v9, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
-; RV64-NEXT:    vmv.s.x v9, a2
+; RV64-NEXT:    vmv.s.x v10, a2
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64-NEXT:    vmulh.vv v9, v8, v9
-; RV64-NEXT:    vid.v v10
-; RV64-NEXT:    vrsub.vi v11, v10, 0
-; RV64-NEXT:    vmadd.vv v11, v8, v9
-; RV64-NEXT:    li a1, 63
+; RV64-NEXT:    vmulh.vv v10, v8, v10
+; RV64-NEXT:    vmadd.vv v11, v8, v10
 ; RV64-NEXT:    vsrl.vx v8, v11, a1
-; RV64-NEXT:    vsra.vv v9, v11, v10
+; RV64-NEXT:    vsra.vv v9, v11, v9
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
@@ -3156,47 +3155,47 @@ define void @mulhu_v32i8(ptr %x) {
 ; CHECK-LABEL: mulhu_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    lui a2, 163907
+; CHECK-NEXT:    addi a2, a2, -2044
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a2
+; CHECK-NEXT:    lui a2, 66049
+; CHECK-NEXT:    addi a2, a2, 32
+; CHECK-NEXT:    vmv.s.x v8, a2
+; CHECK-NEXT:    li a2, -128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    lui a1, 163907
-; CHECK-NEXT:    addi a1, a1, -2044
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    lui a1, 66049
-; CHECK-NEXT:    addi a1, a1, 32
-; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vmerge.vxm v10, v12, a2, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI181_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI181_0)
-; CHECK-NEXT:    vle8.v v14, (a1)
-; CHECK-NEXT:    li a1, -128
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vxm v16, v12, a1, v0
+; CHECK-NEXT:    vle8.v v14, (a0)
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
-; CHECK-NEXT:    vsrl.vv v8, v10, v8
-; CHECK-NEXT:    vmulhu.vv v8, v8, v14
-; CHECK-NEXT:    vsub.vv v10, v10, v8
-; CHECK-NEXT:    vmulhu.vv v10, v10, v16
-; CHECK-NEXT:    vadd.vv v10, v10, v8
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    lui a1, 8208
 ; CHECK-NEXT:    addi a1, a1, 513
+; CHECK-NEXT:    vsrl.vv v8, v14, v8
+; CHECK-NEXT:    vmulhu.vv v12, v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 4
-; CHECK-NEXT:    vmerge.vim v12, v8, 1, v0
 ; CHECK-NEXT:    lui a1, 66785
 ; CHECK-NEXT:    addi a1, a1, 78
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    lui a1, 529160
-; CHECK-NEXT:    addi a1, a1, 304
 ; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    lui a1, 529160
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v12, v12, 3, v0
+; CHECK-NEXT:    vsub.vv v14, v14, v12
+; CHECK-NEXT:    vmulhu.vv v10, v14, v10
+; CHECK-NEXT:    vmv.v.i v14, 4
+; CHECK-NEXT:    addi a1, a1, 304
+; CHECK-NEXT:    vmerge.vim v14, v14, 1, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a1
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v12, 2, v0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v14, v14, 3, v0
+; CHECK-NEXT:    vadd.vv v10, v10, v12
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v14, 2, v0
 ; CHECK-NEXT:    vsrl.vv v8, v10, v8
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -3212,36 +3211,37 @@ define void @mulhu_v16i16(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v10, (a0)
 ; RV32-NEXT:    li a1, 257
-; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    lui a1, 1048568
 ; RV32-NEXT:    vmerge.vxm v12, v8, a1, v0
 ; RV32-NEXT:    lui a1, 4
+; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v14, 0
 ; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV32-NEXT:    vmv.s.x v8, a1
-; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    lui a1, 2
+; RV32-NEXT:    addi a1, a1, 289
+; RV32-NEXT:    vmv.s.x v9, a1
 ; RV32-NEXT:    lui a1, %hi(.LCPI182_0)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI182_0)
-; RV32-NEXT:    vle16.v v14, (a1)
+; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v15, 3
 ; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV32-NEXT:    vmerge.vim v14, v14, 1, v0
+; RV32-NEXT:    vmv1r.v v0, v9
+; RV32-NEXT:    vmerge.vim v9, v15, 2, v0
+; RV32-NEXT:    vle16.v v16, (a1)
+; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vmerge.vim v8, v9, 1, v0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vsext.vf2 v16, v9
-; RV32-NEXT:    vsrl.vv v16, v10, v16
-; RV32-NEXT:    vmulhu.vv v14, v16, v14
+; RV32-NEXT:    vsext.vf2 v18, v14
+; RV32-NEXT:    vsrl.vv v14, v10, v18
+; RV32-NEXT:    vmulhu.vv v14, v14, v16
 ; RV32-NEXT:    vsub.vv v10, v10, v14
 ; RV32-NEXT:    vmulhu.vv v10, v10, v12
 ; RV32-NEXT:    vadd.vv v10, v10, v14
-; RV32-NEXT:    lui a1, 2
-; RV32-NEXT:    addi a1, a1, 289
-; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, 3
-; RV32-NEXT:    vmerge.vim v9, v9, 2, v0
-; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v8, v9, 1, v0
-; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV32-NEXT:    vsext.vf2 v12, v8
 ; RV32-NEXT:    vsrl.vv v8, v10, v12
 ; RV32-NEXT:    vse16.v v8, (a0)
@@ -3252,31 +3252,31 @@ define void @mulhu_v16i16(ptr %x) {
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v8, (a0)
 ; RV64-NEXT:    li a1, 257
-; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    vmv.v.i v10, 0
-; RV64-NEXT:    lui a1, 1048568
-; RV64-NEXT:    vmerge.vxm v10, v10, a1, v0
+; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    lui a1, %hi(.LCPI182_0)
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI182_0)
 ; RV64-NEXT:    vle16.v v12, (a1)
+; RV64-NEXT:    lui a1, 1048568
+; RV64-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; RV64-NEXT:    li a1, 1
 ; RV64-NEXT:    slli a1, a1, 48
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v14, a1
+; RV64-NEXT:    lui a1, %hi(.LCPI182_1)
+; RV64-NEXT:    ld a1, %lo(.LCPI182_1)(a1)
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vsext.vf2 v16, v14
 ; RV64-NEXT:    vsrl.vv v14, v8, v16
 ; RV64-NEXT:    vmulhu.vv v12, v14, v12
-; RV64-NEXT:    lui a1, %hi(.LCPI182_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI182_1)(a1)
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v14, a1
+; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vsub.vv v8, v8, v12
 ; RV64-NEXT:    vmulhu.vv v8, v8, v10
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vsext.vf2 v12, v10
-; RV64-NEXT:    vsrl.vv v8, v8, v12
+; RV64-NEXT:    vsext.vf2 v10, v14
+; RV64-NEXT:    vsrl.vv v8, v8, v10
 ; RV64-NEXT:    vse16.v v8, (a0)
 ; RV64-NEXT:    ret
   %a = load <16 x i16>, ptr %x
@@ -3291,22 +3291,22 @@ define void @mulhu_v8i32(ptr %x) {
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    li a1, 68
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, %hi(.LCPI183_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI183_0)
-; CHECK-NEXT:    vle32.v v10, (a1)
-; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vle32.v v12, (a1)
 ; CHECK-NEXT:    lui a1, 524288
-; CHECK-NEXT:    vmerge.vxm v12, v12, a1, v0
-; CHECK-NEXT:    vmulhu.vv v10, v8, v10
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vmulhu.vv v8, v8, v12
-; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; CHECK-NEXT:    lui a1, 4128
 ; CHECK-NEXT:    addi a1, a1, 514
+; CHECK-NEXT:    vmulhu.vv v12, v8, v12
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v12
 ; CHECK-NEXT:    vsext.vf4 v12, v10
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -3326,24 +3326,22 @@ define void @mulhu_v4i64(ptr %x) {
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI184_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v10, (a1)
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vmulhu.vv v10, v8, v10
 ; RV32-NEXT:    lui a1, 524288
-; RV32-NEXT:    vmv.s.x v12, a1
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v14, 0
-; RV32-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v14, v12, 5
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmv.s.x v14, a1
 ; RV32-NEXT:    lui a1, %hi(.LCPI184_1)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI184_1)
+; RV32-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v12, v14, 5
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vle8.v v12, (a1)
+; RV32-NEXT:    vle8.v v14, (a1)
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vmulhu.vv v10, v8, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vmulhu.vv v8, v8, v14
+; RV32-NEXT:    vmulhu.vv v8, v8, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vsext.vf4 v10, v12
+; RV32-NEXT:    vsext.vf4 v10, v14
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vv v8, v8, v10
 ; RV32-NEXT:    vse64.v v8, (a0)
@@ -3356,22 +3354,22 @@ define void @mulhu_v4i64(ptr %x) {
 ; RV64-NEXT:    lui a1, %hi(.LCPI184_0)
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI184_0)
 ; RV64-NEXT:    vle64.v v10, (a1)
-; RV64-NEXT:    vmulhu.vv v10, v8, v10
-; RV64-NEXT:    vsub.vv v8, v8, v10
 ; RV64-NEXT:    li a1, -1
+; RV64-NEXT:    vmv.v.i v12, 0
 ; RV64-NEXT:    slli a1, a1, 63
-; RV64-NEXT:    vmv.s.x v12, a1
-; RV64-NEXT:    vmv.v.i v14, 0
+; RV64-NEXT:    vmv.s.x v14, a1
+; RV64-NEXT:    lui a1, 12320
+; RV64-NEXT:    addi a1, a1, 513
 ; RV64-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v14, v12, 2
+; RV64-NEXT:    vslideup.vi v12, v14, 2
+; RV64-NEXT:    vmv.s.x v14, a1
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmulhu.vv v8, v8, v14
+; RV64-NEXT:    vmulhu.vv v10, v8, v10
+; RV64-NEXT:    vsub.vv v8, v8, v10
+; RV64-NEXT:    vmulhu.vv v8, v8, v12
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a1, 12320
-; RV64-NEXT:    addi a1, a1, 513
-; RV64-NEXT:    vmv.s.x v10, a1
-; RV64-NEXT:    vsext.vf8 v12, v10
-; RV64-NEXT:    vsrl.vv v8, v8, v12
+; RV64-NEXT:    vsext.vf8 v10, v14
+; RV64-NEXT:    vsrl.vv v8, v8, v10
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
   %a = load <4 x i64>, ptr %x
@@ -3384,16 +3382,16 @@ define void @mulhs_v32i8(ptr %x) {
 ; CHECK-LABEL: mulhs_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    lui a2, 304453
+; CHECK-NEXT:    addi a2, a2, -1452
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a2
+; CHECK-NEXT:    li a2, -123
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    lui a1, 304453
-; CHECK-NEXT:    addi a1, a1, -1452
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 7
 ; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
-; CHECK-NEXT:    li a1, -123
-; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmv.v.x v12, a2
 ; CHECK-NEXT:    li a1, 57
 ; CHECK-NEXT:    vmerge.vxm v12, v12, a1, v0
 ; CHECK-NEXT:    vmulhu.vv v8, v8, v12
@@ -3437,11 +3435,11 @@ define void @mulhs_v8i32(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    li a1, 85
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    lui a1, 419430
 ; RV32-NEXT:    addi a1, a1, 1639
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 85
-; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    lui a1, 629146
 ; RV32-NEXT:    addi a1, a1, -1639
 ; RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
@@ -3479,63 +3477,61 @@ define void @mulhs_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a2, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a2
 ; RV32-NEXT:    li a2, 17
 ; RV32-NEXT:    vmv.s.x v0, a2
-; RV32-NEXT:    addi a1, a1, 1366
-; RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vmulh.vv v10, v8, v10
-; RV32-NEXT:    lui a1, 1048560
+; RV32-NEXT:    lui a2, 1048560
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a2
+; RV32-NEXT:    addi a2, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vsext.vf4 v14, v12
+; RV32-NEXT:    vmv.v.x v12, a2
+; RV32-NEXT:    li a2, 63
+; RV32-NEXT:    addi a1, a1, 1366
+; RV32-NEXT:    vmerge.vxm v12, v12, a1, v0
+; RV32-NEXT:    lui a1, 16
+; RV32-NEXT:    vsext.vf4 v14, v10
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vmulh.vv v10, v8, v12
 ; RV32-NEXT:    vmadd.vv v14, v8, v10
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsrl.vx v8, v14, a1
-; RV32-NEXT:    lui a1, 16
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vsext.vf4 v12, v10
+; RV32-NEXT:    vsext.vf4 v10, v8
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsra.vv v10, v14, v12
+; RV32-NEXT:    vsrl.vx v8, v14, a2
+; RV32-NEXT:    vsra.vv v10, v14, v10
 ; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mulhs_v4i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 1044496
 ; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addi a2, a2, -256
+; RV64-NEXT:    vmv.s.x v10, a2
 ; RV64-NEXT:    slli a2, a1, 32
 ; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    lui a2, %hi(.LCPI188_0)
 ; RV64-NEXT:    ld a2, %lo(.LCPI188_0)(a2)
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; RV64-NEXT:    vmv.v.i v0, 5
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vmerge.vxm v10, v10, a2, v0
-; RV64-NEXT:    vmulh.vv v10, v8, v10
-; RV64-NEXT:    lui a1, 1044496
-; RV64-NEXT:    addi a1, a1, -256
-; RV64-NEXT:    vmv.s.x v12, a1
-; RV64-NEXT:    vsext.vf8 v14, v12
-; RV64-NEXT:    vmadd.vv v14, v8, v10
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    li a1, 63
+; RV64-NEXT:    vmerge.vxm v12, v12, a2, v0
+; RV64-NEXT:    lui a2, 4096
+; RV64-NEXT:    addi a2, a2, 256
+; RV64-NEXT:    vsext.vf8 v14, v10
+; RV64-NEXT:    vmulh.vv v10, v8, v12
+; RV64-NEXT:    vmadd.vv v14, v8, v10
+; RV64-NEXT:    vmv.s.x v8, a2
+; RV64-NEXT:    vsext.vf8 v10, v8
 ; RV64-NEXT:    vsrl.vx v8, v14, a1
-; RV64-NEXT:    lui a1, 4096
-; RV64-NEXT:    addi a1, a1, 256
-; RV64-NEXT:    vmv.s.x v10, a1
-; RV64-NEXT:    vsext.vf8 v12, v10
-; RV64-NEXT:    vsra.vv v10, v14, v12
+; RV64-NEXT:    vsra.vv v10, v14, v10
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
@@ -5632,12 +5628,12 @@ define void @mulhs_vx_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a2, a1, 1365
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    addi a3, a1, 1365
 ; RV32-NEXT:    addi a1, a1, 1366
 ; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v9, (a1), zero
+; RV32-NEXT:    sw a3, 12(sp)
+; RV32-NEXT:    vlse64.v v9, (a2), zero
 ; RV32-NEXT:    vmulh.vv v8, v8, v9
 ; RV32-NEXT:    li a1, 63
 ; RV32-NEXT:    vsrl.vx v9, v8, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index af46849ae08719..30e41f2f526e57 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -15,32 +15,37 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    ld a6, 56(a0)
 ; ZVE32X-NEXT:    ld a7, 72(a0)
 ; ZVE32X-NEXT:    ld a0, 80(a0)
+; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT:    vmv.s.x v9, zero
+; ZVE32X-NEXT:    vmv.v.i v10, 0
 ; ZVE32X-NEXT:    xor a3, a3, a4
+; ZVE32X-NEXT:    xor a1, a1, a2
+; ZVE32X-NEXT:    xor a2, a5, a6
+; ZVE32X-NEXT:    xor a0, a7, a0
 ; ZVE32X-NEXT:    snez a3, a3
-; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVE32X-NEXT:    snez a1, a1
+; ZVE32X-NEXT:    snez a2, a2
+; ZVE32X-NEXT:    snez a0, a0
 ; ZVE32X-NEXT:    vmv.s.x v8, a3
+; ZVE32X-NEXT:    vmv.s.x v11, a1
+; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v8, v8, 1
+; ZVE32X-NEXT:    vand.vi v11, v11, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v8, 0
-; ZVE32X-NEXT:    vmv.s.x v9, zero
-; ZVE32X-NEXT:    vmerge.vim v8, v9, 1, v0
-; ZVE32X-NEXT:    xor a1, a1, a2
-; ZVE32X-NEXT:    snez a1, a1
-; ZVE32X-NEXT:    vmv.s.x v10, a1
-; ZVE32X-NEXT:    vand.vi v10, v10, 1
-; ZVE32X-NEXT:    vmsne.vi v0, v10, 0
+; ZVE32X-NEXT:    vmsne.vi v8, v11, 0
+; ZVE32X-NEXT:    vmerge.vim v11, v9, 1, v0
+; ZVE32X-NEXT:    vmv1r.v v0, v8
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmv.v.i v10, 0
-; ZVE32X-NEXT:    vmerge.vim v11, v10, 1, v0
+; ZVE32X-NEXT:    vmerge.vim v8, v10, 1, v0
 ; ZVE32X-NEXT:    vsetivli zero, 2, e8, mf4, tu, ma
-; ZVE32X-NEXT:    vslideup.vi v11, v8, 1
+; ZVE32X-NEXT:    vslideup.vi v8, v11, 1
+; ZVE32X-NEXT:    vmv.s.x v11, a2
+; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVE32X-NEXT:    vand.vi v11, v11, 1
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
-; ZVE32X-NEXT:    xor a1, a5, a6
-; ZVE32X-NEXT:    snez a1, a1
-; ZVE32X-NEXT:    vmv.s.x v8, a1
+; ZVE32X-NEXT:    vmsne.vi v0, v8, 0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vand.vi v8, v8, 1
-; ZVE32X-NEXT:    vmsne.vi v8, v8, 0
+; ZVE32X-NEXT:    vmsne.vi v8, v11, 0
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmerge.vim v11, v10, 1, v0
 ; ZVE32X-NEXT:    vmv1r.v v0, v8
@@ -48,13 +53,12 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vmerge.vim v8, v9, 1, v0
 ; ZVE32X-NEXT:    vsetivli zero, 3, e8, mf4, tu, ma
 ; ZVE32X-NEXT:    vslideup.vi v11, v8, 2
-; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
-; ZVE32X-NEXT:    xor a0, a7, a0
-; ZVE32X-NEXT:    snez a0, a0
 ; ZVE32X-NEXT:    vmv.s.x v8, a0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v8, v8, 1
+; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
+; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v8, v8, 0
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmerge.vim v10, v10, 1, v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index b56814ea4c372a..fa1377406d697b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -12,9 +12,9 @@ define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v10, (a0)
+; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vnsrl.wi v8, v10, 0
-; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    vnsrl.wx v9, v10, a0
 ; RV32-NEXT:    ret
 ;
@@ -183,129 +183,107 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 84
+; RV32-NEXT:    li a3, 81
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb
-; RV32-NEXT:    addi a3, a1, 256
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 81 * vlenb
+; RV32-NEXT:    addi a3, a1, 128
+; RV32-NEXT:    addi a4, a1, 256
 ; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    lui a5, 12
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v8, (a3)
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 76
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, a1, 128
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vslideup.vi v4, v8, 4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs4r.v v4, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 12
-; RV32-NEXT:    vmv.s.x v0, a4
+; RV32-NEXT:    vle32.v v16, (a4)
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    li a6, 57
+; RV32-NEXT:    mul a4, a4, a6
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vslideup.vi v4, v8, 10, v0.t
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, %hi(.LCPI8_0)
 ; RV32-NEXT:    addi a4, a4, %lo(.LCPI8_0)
+; RV32-NEXT:    vmv.s.x v1, a5
+; RV32-NEXT:    lui a5, %hi(.LCPI8_1)
+; RV32-NEXT:    addi a5, a5, %lo(.LCPI8_1)
+; RV32-NEXT:    vle16.v v4, (a4)
+; RV32-NEXT:    lui a4, 1
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vslideup.vi v12, v16, 4
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 37
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs4r.v v12, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v16, v16, 16
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 45
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT:    vslideup.vi v12, v16, 10, v0.t
+; RV32-NEXT:    vmv.v.v v28, v12
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v0, (a4)
-; RV32-NEXT:    lui a4, %hi(.LCPI8_1)
-; RV32-NEXT:    addi a4, a4, %lo(.LCPI8_1)
-; RV32-NEXT:    lui a5, 1
-; RV32-NEXT:    vle16.v v8, (a4)
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a6, 56
-; RV32-NEXT:    mul a4, a4, a6
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vle16.v v24, (a5)
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 68
-; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    slli a5, a1, 6
+; RV32-NEXT:    add a1, a5, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v24, (a3)
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 60
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, a5, -64
-; RV32-NEXT:    vmv.s.x v16, a1
+; RV32-NEXT:    vle32.v v16, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v0
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, a4, -64
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v4
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v4, v16
+; RV32-NEXT:    vmv.v.v v28, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 36
+; RV32-NEXT:    li a3, 41
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vslideup.vi v12, v8, 2
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v1, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslideup.vi v12, v16, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -318,7 +296,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    vle16.v v8, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -328,34 +306,34 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v2, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 68
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 6
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v24, v16, v12
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 60
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v24, v8, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -363,13 +341,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v8, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -378,15 +356,15 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vrgatherei16.vv v8, v24, v2
 ; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslideup.vi v8, v24, 6, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
@@ -398,22 +376,18 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vle16.v v24, (a1)
 ; RV32-NEXT:    vle16.v v4, (a3)
 ; RV32-NEXT:    li a1, 960
-; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.s.x v28, a1
 ; RV32-NEXT:    vrgatherei16.vv v8, v16, v24
+; RV32-NEXT:    vmv1r.v v0, v28
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 60
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -423,70 +397,78 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a3, %hi(.LCPI8_8)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI8_8)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v16, (a1)
+; RV32-NEXT:    vle16.v v8, (a1)
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_9)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_9)
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v8, (a3)
+; RV32-NEXT:    vle16.v v12, (a3)
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 2
+; RV32-NEXT:    li a4, 13
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs4r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vle16.v v8, (a1)
+; RV32-NEXT:    vs4r.v v12, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v12, v8, v16
+; RV32-NEXT:    vrgatherei16.vv v20, v16, v8
 ; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v12, v16, 4, v0.t
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv4r.v v24, v8
+; RV32-NEXT:    vslideup.vi v20, v8, 4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    li a3, 21
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 68
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 6
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    li a3, 13
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v8, v0, v20
+; RV32-NEXT:    vrgatherei16.vv v8, v0, v16
+; RV32-NEXT:    vmv1r.v v0, v28
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
+; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a3, 13
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -497,21 +479,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a1, 15
 ; RV32-NEXT:    vmv.s.x v3, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v8, v24, 6
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vslideup.vi v8, v16, 6
 ; RV32-NEXT:    vmv1r.v v0, v3
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v12, v0.t
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v12, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv4r.v v24, v16
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_11)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_11)
 ; RV32-NEXT:    lui a3, %hi(.LCPI8_12)
@@ -527,21 +508,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 68
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 6
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v16, v28
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 60
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -560,7 +542,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 37
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -568,13 +550,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -582,8 +564,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v20, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 68
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 6
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -595,7 +577,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 60
+; RV32-NEXT:    li a2, 73
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -604,26 +586,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    li a2, 21
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a2, 13
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v28, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 76
+; RV32-NEXT:    li a2, 57
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a2, a1, 2
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -640,21 +624,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vse32.v v20, (a1)
 ; RV32-NEXT:    addi a1, a0, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 36
+; RV32-NEXT:    li a2, 41
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 84
+; RV32-NEXT:    li a1, 81
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
@@ -667,141 +651,130 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 6
-; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a2, a2, 6
 ; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 65 * vlenb
-; RV64-NEXT:    addi a2, a1, 256
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v16, (a2)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 21
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
 ; RV64-NEXT:    addi a2, a1, 128
+; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a1)
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 48
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, a1, 256
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 57
+; RV64-NEXT:    li a3, 20
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    li a1, 128
+; RV64-NEXT:    vid.v v10
+; RV64-NEXT:    vmv.s.x v1, a1
+; RV64-NEXT:    li a1, 6
+; RV64-NEXT:    vmul.vx v2, v10, a1
+; RV64-NEXT:    li a1, 56
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vrgather.vi v12, v16, 4
-; RV64-NEXT:    li a1, 128
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v16, v16, 8
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 37
-; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 36
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgather.vi v12, v16, 2, v0.t
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vid.v v10
-; RV64-NEXT:    li a1, 6
-; RV64-NEXT:    vmul.vx v8, v10, a1
-; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vle64.v v24, (a2)
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vle64.v v16, (a2)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 45
+; RV64-NEXT:    li a3, 56
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv.s.x v10, a1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs1r.v v10, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vadd.vi v10, v8, -16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.s.x v7, a1
+; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT:    vadd.vi v10, v2, -16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v16, v0, v8
-; RV64-NEXT:    vmv2r.v v4, v8
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v2
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl1r.v v6, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv1r.v v0, v6
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v10, v0.t
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v10, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v16
+; RV64-NEXT:    vmv.v.v v12, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 4
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
+; RV64-NEXT:    li a2, 20
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v12, v8, 5
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl1r.v v1, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v12, v16, 5
 ; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
+; RV64-NEXT:    li a2, 36
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vrgather.vi v12, v16, 3, v0.t
-; RV64-NEXT:    vmv.v.v v28, v12
+; RV64-NEXT:    vmv.v.v v20, v12
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v24, v4, 1
-; RV64-NEXT:    vadd.vi v26, v4, -15
+; RV64-NEXT:    vadd.vi v16, v2, 1
+; RV64-NEXT:    vadd.vi v18, v2, -15
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v24
-; RV64-NEXT:    vmv1r.v v0, v6
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v16
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v26, v0.t
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v18, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v28, v16
+; RV64-NEXT:    vmv.v.v v20, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    li a2, 12
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a1, 16
 ; RV64-NEXT:    addi a1, a1, 7
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 6
 ; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
+; RV64-NEXT:    li a2, 20
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
@@ -809,72 +782,66 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vrgatherei16.vv v12, v16, v9
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vrgatherei16.vv v12, v16, v10
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv4r.v v8, v16
 ; RV64-NEXT:    vrgather.vi v12, v16, 2
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vrgather.vi v12, v16, 3
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    li a2, 28
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a1, 24
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.s.x v7, a1
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v16, v4, 2
-; RV64-NEXT:    vadd.vi v2, v4, -14
+; RV64-NEXT:    vadd.vi v10, v2, 2
+; RV64-NEXT:    vadd.vi v4, v2, -14
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v16
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v10
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v2, v0.t
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v4, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 20
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
+; RV64-NEXT:    li a2, 36
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
@@ -882,194 +849,168 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgather.vi v28, v24, 4, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv2r.v v8, v4
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v4, v4, 3
-; RV64-NEXT:    vadd.vi v6, v8, -13
-; RV64-NEXT:    vmv2r.v v2, v8
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vi v30, v2, 3
+; RV64-NEXT:    vadd.vi v28, v2, -13
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v4
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v6, v0.t
+; RV64-NEXT:    vrgatherei16.vv v8, v16, v30
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
+; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vmv4r.v v16, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v4, v16, 5, v0.t
+; RV64-NEXT:    vrgather.vi v4, v24, 5, v0.t
 ; RV64-NEXT:    lui a1, 96
 ; RV64-NEXT:    li a2, 192
-; RV64-NEXT:    vmv.s.x v1, a2
+; RV64-NEXT:    vmv.s.x v8, a2
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a1
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    vmv1r.v v0, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
+; RV64-NEXT:    vrgatherei16.vv v12, v24, v9, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a1, 28
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.s.x v1, a1
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v22, v2, 4
-; RV64-NEXT:    vadd.vi v20, v2, -12
+; RV64-NEXT:    vadd.vi v10, v2, 4
+; RV64-NEXT:    vadd.vi v12, v2, -12
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v22
+; RV64-NEXT:    vrgatherei16.vv v16, v24, v10
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
+; RV64-NEXT:    vrgatherei16.vv v16, v24, v12, v0.t
 ; RV64-NEXT:    lui a1, 112
 ; RV64-NEXT:    addi a1, a1, 1
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v12, a1
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    vmv1r.v v0, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    li a2, 28
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 36
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v20, v16, v12, v0.t
+; RV64-NEXT:    vrgatherei16.vv v12, v24, v9, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    li a2, 28
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 20
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v24
+; RV64-NEXT:    vmv.v.v v20, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v12, v2, 5
+; RV64-NEXT:    vadd.vi v20, v2, 5
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v24, v16, v12
+; RV64-NEXT:    vrgatherei16.vv v8, v24, v20
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v12, v2, -11
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vi v20, v2, -11
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
+; RV64-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
 ; RV64-NEXT:    vmv4r.v v12, v4
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
 ; RV64-NEXT:    vmv.v.v v12, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v20, v8
+; RV64-NEXT:    vmv.v.v v20, v16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    li a2, 28
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v8, v24
+; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v16, v8
 ; RV64-NEXT:    addi a1, a0, 320
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vse64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
 ; RV64-NEXT:    vse64.v v20, (a1)
 ; RV64-NEXT:    addi a1, a0, 192
 ; RV64-NEXT:    vse64.v v12, (a1)
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 53
+; RV64-NEXT:    li a3, 44
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
@@ -1077,22 +1018,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    addi a1, a0, 64
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 13
+; RV64-NEXT:    li a3, 12
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 4
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 6
-; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    slli a0, a0, 6
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
index eb5da36116af37..f27614c93985f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
@@ -88,13 +88,13 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a1
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    ret
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
   ret <2 x i64> %a
@@ -187,25 +187,23 @@ define <3 x i64> @llrint_v3i64_v3f32(<3 x float> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vfmv.f.s fa5, v9
+; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %a = call <3 x i64> @llvm.llrint.v3i64.v3f32(<3 x float> %x)
   ret <3 x i64> %a
@@ -298,25 +296,23 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vfmv.f.s fa5, v9
+; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
   ret <4 x i64> %a
@@ -427,37 +423,37 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vslidedown.vi v10, v8, 7
+; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v9, v8, 3
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-NEXT:    fcvt.l.s a4, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
+; RV64-NEXT:    fcvt.l.s a5, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    fcvt.l.s a6, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    sd a4, 32(sp)
 ; RV64-NEXT:    sd a3, 40(sp)
 ; RV64-NEXT:    sd a2, 48(sp)
 ; RV64-NEXT:    sd a1, 56(sp)
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 3
-; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.s a2, fa5
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-NEXT:    sd a0, 0(sp)
-; RV64-NEXT:    sd a3, 8(sp)
-; RV64-NEXT:    sd a2, 16(sp)
-; RV64-NEXT:    sd a1, 24(sp)
+; RV64-NEXT:    sd a1, 8(sp)
+; RV64-NEXT:    sd a6, 16(sp)
+; RV64-NEXT:    sd a5, 24(sp)
 ; RV64-NEXT:    mv a0, sp
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
@@ -619,62 +615,62 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vse32.v v8, (a0)
 ; RV64-NEXT:    flw fa5, 124(sp)
+; RV64-NEXT:    vfmv.f.s fa4, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vslidedown.vi v11, v8, 2
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    sd a0, 248(sp)
 ; RV64-NEXT:    flw fa5, 120(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 240(sp)
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    fcvt.l.s a0, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v10
+; RV64-NEXT:    fcvt.l.s a1, fa5
+; RV64-NEXT:    sd a1, 240(sp)
 ; RV64-NEXT:    flw fa5, 116(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 232(sp)
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v14, v8, 7
+; RV64-NEXT:    fcvt.l.s a1, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v11
+; RV64-NEXT:    fcvt.l.s a2, fa5
+; RV64-NEXT:    sd a2, 232(sp)
 ; RV64-NEXT:    flw fa5, 112(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 224(sp)
+; RV64-NEXT:    fcvt.l.s a2, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v12
+; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    fcvt.l.s a3, fa5
+; RV64-NEXT:    sd a3, 224(sp)
 ; RV64-NEXT:    flw fa5, 108(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 216(sp)
+; RV64-NEXT:    fcvt.l.s a3, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v14
+; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    fcvt.l.s a4, fa5
+; RV64-NEXT:    sd a4, 216(sp)
 ; RV64-NEXT:    flw fa5, 104(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 208(sp)
+; RV64-NEXT:    fcvt.l.s a4, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v10
+; RV64-NEXT:    fcvt.l.s a5, fa4
+; RV64-NEXT:    fcvt.l.s a6, fa5
+; RV64-NEXT:    sd a6, 208(sp)
 ; RV64-NEXT:    flw fa5, 100(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 200(sp)
-; RV64-NEXT:    flw fa5, 96(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 192(sp)
+; RV64-NEXT:    vfmv.f.s fa4, v12
+; RV64-NEXT:    fcvt.l.s a6, fa4
+; RV64-NEXT:    vslidedown.vi v8, v8, 4
+; RV64-NEXT:    fcvt.l.s a7, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a1, fa5
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a2, fa5
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a3, fa5
+; RV64-NEXT:    sd a7, 200(sp)
+; RV64-NEXT:    fcvt.l.s a7, fa5
+; RV64-NEXT:    flw fa5, 96(sp)
 ; RV64-NEXT:    sd a0, 128(sp)
 ; RV64-NEXT:    sd a3, 136(sp)
 ; RV64-NEXT:    sd a2, 144(sp)
 ; RV64-NEXT:    sd a1, 152(sp)
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    sd a7, 160(sp)
+; RV64-NEXT:    sd a6, 168(sp)
+; RV64-NEXT:    sd a5, 176(sp)
+; RV64-NEXT:    sd a4, 184(sp)
 ; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a1, fa5
-; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a2, fa5
-; RV64-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a3, fa5
-; RV64-NEXT:    sd a3, 160(sp)
-; RV64-NEXT:    sd a2, 168(sp)
-; RV64-NEXT:    sd a1, 176(sp)
-; RV64-NEXT:    sd a0, 184(sp)
+; RV64-NEXT:    sd a0, 192(sp)
 ; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
@@ -775,12 +771,12 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.d a1, fa5
-; RV64-NEXT:    vmv.v.x v8, a1
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    ret
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> %x)
   ret <2 x i64> %a
@@ -871,21 +867,22 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.d a1, fa5
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
+; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    fcvt.l.d a1, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v12
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x)
   ret <4 x i64> %a
@@ -987,34 +984,34 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    fld fa5, 56(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    sd a0, 120(sp)
-; RV64-NEXT:    fld fa5, 48(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    sd a0, 112(sp)
-; RV64-NEXT:    fld fa5, 40(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    sd a0, 104(sp)
-; RV64-NEXT:    fld fa5, 32(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    sd a0, 96(sp)
-; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    vfmv.f.s fa4, v8
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    fcvt.l.d a0, fa4
 ; RV64-NEXT:    fcvt.l.d a1, fa5
+; RV64-NEXT:    sd a1, 120(sp)
+; RV64-NEXT:    fld fa5, 48(sp)
+; RV64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    fcvt.l.d a1, fa4
 ; RV64-NEXT:    fcvt.l.d a2, fa5
+; RV64-NEXT:    sd a2, 112(sp)
+; RV64-NEXT:    fld fa5, 40(sp)
+; RV64-NEXT:    vfmv.f.s fa4, v10
+; RV64-NEXT:    fcvt.l.d a2, fa4
 ; RV64-NEXT:    vslidedown.vi v8, v8, 2
+; RV64-NEXT:    fcvt.l.d a3, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    sd a3, 104(sp)
 ; RV64-NEXT:    fcvt.l.d a3, fa5
+; RV64-NEXT:    fld fa5, 32(sp)
 ; RV64-NEXT:    sd a0, 64(sp)
 ; RV64-NEXT:    sd a1, 72(sp)
 ; RV64-NEXT:    sd a3, 80(sp)
 ; RV64-NEXT:    sd a2, 88(sp)
+; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    sd a0, 96(sp)
 ; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
index 8f1e026d09c0a2..356bc5edd77a19 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
@@ -88,14 +88,14 @@ define <6 x i1> @load_v6i1(ptr %p) {
 ; RV32-NEXT:    lbu a0, 0(a0)
 ; RV32-NEXT:    srli a1, a0, 5
 ; RV32-NEXT:    slli a2, a0, 27
-; RV32-NEXT:    srli a2, a2, 31
 ; RV32-NEXT:    slli a3, a0, 28
-; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    slli a4, a0, 29
-; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    slli a5, a0, 30
-; RV32-NEXT:    srli a5, a5, 31
 ; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    srli a2, a2, 31
+; RV32-NEXT:    srli a3, a3, 31
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    srli a5, a5, 31
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
@@ -113,14 +113,14 @@ define <6 x i1> @load_v6i1(ptr %p) {
 ; RV64-NEXT:    lbu a0, 0(a0)
 ; RV64-NEXT:    srli a1, a0, 5
 ; RV64-NEXT:    slli a2, a0, 59
-; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    slli a3, a0, 60
-; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    slli a4, a0, 61
-; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    slli a5, a0, 62
-; RV64-NEXT:    srli a5, a5, 63
 ; RV64-NEXT:    andi a0, a0, 1
+; RV64-NEXT:    srli a2, a2, 63
+; RV64-NEXT:    srli a3, a3, 63
+; RV64-NEXT:    srli a4, a4, 63
+; RV64-NEXT:    srli a5, a5, 63
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    vslide1down.vx v8, v8, a5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
index 0e98fd1ab0f5dd..2f58e3dd2769f1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
@@ -41,37 +41,37 @@ define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v9
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v9
 ; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v2f32:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v9
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vmv.v.x v8, a1
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    vmv.v.x v8, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v2f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v8, a1
-; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i64-NEXT:    vmv.v.x v8, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i64-NEXT:    ret
   %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float> %x)
   ret <2 x iXLen> %a
@@ -83,65 +83,63 @@ define <3 x iXLen> @lrint_v3f32(<3 x float> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v9
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v9
+; RV32-NEXT:    fcvt.w.s a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v10
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    vslide1down.vx v8, v9, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v3f32:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v9
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vmv.v.x v9, a1
-; RV64-i32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64-i32-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v9
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v10
+; RV64-i32-NEXT:    vmv.v.x v9, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    vslide1down.vx v8, v9, a1
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v3f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v10, a1
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
+; RV64-i64-NEXT:    vmv.v.x v10, a0
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    ret
   %a = call <3 x iXLen> @llvm.lrint.v3iXLen.v3f32(<3 x float> %x)
   ret <3 x iXLen> %a
@@ -153,65 +151,63 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v9
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v9
+; RV32-NEXT:    fcvt.w.s a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v10
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    vslide1down.vx v8, v9, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v4f32:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v9
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vmv.v.x v9, a1
-; RV64-i32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64-i32-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v9
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v10
+; RV64-i32-NEXT:    vmv.v.x v9, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    vslide1down.vx v8, v9, a1
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v4f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v10, a1
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
+; RV64-i64-NEXT:    vmv.v.x v10, a0
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    ret
   %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float> %x)
   ret <4 x iXLen> %a
@@ -223,82 +219,74 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    vslidedown.vi v11, v8, 2
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v10
 ; RV32-NEXT:    fcvt.w.s a1, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v11
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vfmv.f.s fa5, v12
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa5, v12
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    fcvt.w.s a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v12
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vfmv.f.s fa5, v12
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 6
+; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    fcvt.w.s a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v12
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    vslide1down.vx v8, v10, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v8f32:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    vslidedown.vi v11, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i32-NEXT:    fcvt.l.s a1, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v11
 ; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-i32-NEXT:    vmv.v.x v10, a1
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-i32-NEXT:    vfmv.f.s fa5, v12
+; RV64-i32-NEXT:    vmv.v.x v10, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v12, v8, 3
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v12
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i32-NEXT:    vslidedown.vi v12, v8, 4
+; RV64-i32-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v12
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i32-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-i32-NEXT:    vfmv.f.s fa5, v12
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v12
 ; RV64-i32-NEXT:    vslidedown.vi v12, v8, 6
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 7
+; RV64-i32-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v12
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 7
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v8f32:
@@ -314,37 +302,37 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
 ; RV64-i64-NEXT:    andi sp, sp, -64
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 7
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-i64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-i64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-i64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-i64-NEXT:    vslidedown.vi v9, v8, 3
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-i64-NEXT:    fcvt.l.s a4, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
+; RV64-i64-NEXT:    fcvt.l.s a5, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    fcvt.l.s a6, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i64-NEXT:    sd a4, 32(sp)
 ; RV64-i64-NEXT:    sd a3, 40(sp)
 ; RV64-i64-NEXT:    sd a2, 48(sp)
 ; RV64-i64-NEXT:    sd a1, 56(sp)
-; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v9, v8, 3
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
-; RV64-i64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.s a2, fa5
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-i64-NEXT:    sd a0, 0(sp)
-; RV64-i64-NEXT:    sd a3, 8(sp)
-; RV64-i64-NEXT:    sd a2, 16(sp)
-; RV64-i64-NEXT:    sd a1, 24(sp)
+; RV64-i64-NEXT:    sd a1, 8(sp)
+; RV64-i64-NEXT:    sd a6, 16(sp)
+; RV64-i64-NEXT:    sd a5, 24(sp)
 ; RV64-i64-NEXT:    mv a0, sp
 ; RV64-i64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-i64-NEXT:    vle64.v v8, (a0)
@@ -378,62 +366,62 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    flw fa5, 60(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vslidedown.vi v11, v8, 2
 ; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    sw a0, 124(sp)
 ; RV32-NEXT:    flw fa5, 56(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 120(sp)
+; RV32-NEXT:    fcvt.w.s a0, fa4
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    fcvt.w.s a1, fa5
+; RV32-NEXT:    sw a1, 120(sp)
 ; RV32-NEXT:    flw fa5, 52(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 116(sp)
+; RV32-NEXT:    fcvt.w.s a1, fa4
+; RV32-NEXT:    vfmv.f.s fa4, v11
+; RV32-NEXT:    fcvt.w.s a2, fa4
+; RV32-NEXT:    fcvt.w.s a3, fa5
+; RV32-NEXT:    sw a3, 116(sp)
 ; RV32-NEXT:    flw fa5, 48(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 112(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 7
+; RV32-NEXT:    fcvt.w.s a3, fa4
+; RV32-NEXT:    fcvt.w.s a4, fa5
+; RV32-NEXT:    sw a4, 112(sp)
 ; RV32-NEXT:    flw fa5, 44(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 108(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 6
+; RV32-NEXT:    fcvt.w.s a4, fa4
+; RV32-NEXT:    fcvt.w.s a5, fa5
+; RV32-NEXT:    sw a5, 108(sp)
 ; RV32-NEXT:    flw fa5, 40(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 104(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 5
+; RV32-NEXT:    fcvt.w.s a5, fa4
+; RV32-NEXT:    fcvt.w.s a6, fa5
+; RV32-NEXT:    sw a6, 104(sp)
 ; RV32-NEXT:    flw fa5, 36(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 100(sp)
-; RV32-NEXT:    flw fa5, 32(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 96(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    fcvt.w.s a6, fa4
+; RV32-NEXT:    vslidedown.vi v8, v8, 4
+; RV32-NEXT:    fcvt.w.s a7, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a2, fa5
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a3, fa5
+; RV32-NEXT:    sw a7, 100(sp)
+; RV32-NEXT:    fcvt.w.s a7, fa5
+; RV32-NEXT:    flw fa5, 32(sp)
 ; RV32-NEXT:    sw a0, 64(sp)
 ; RV32-NEXT:    sw a3, 68(sp)
 ; RV32-NEXT:    sw a2, 72(sp)
 ; RV32-NEXT:    sw a1, 76(sp)
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 7
-; RV32-NEXT:    vfmv.f.s fa5, v10
+; RV32-NEXT:    sw a7, 80(sp)
+; RV32-NEXT:    sw a6, 84(sp)
+; RV32-NEXT:    sw a5, 88(sp)
+; RV32-NEXT:    sw a4, 92(sp)
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a2, fa5
-; RV32-NEXT:    vslidedown.vi v8, v8, 4
-; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.s a3, fa5
-; RV32-NEXT:    sw a3, 80(sp)
-; RV32-NEXT:    sw a2, 84(sp)
-; RV32-NEXT:    sw a1, 88(sp)
-; RV32-NEXT:    sw a0, 92(sp)
+; RV32-NEXT:    sw a0, 96(sp)
 ; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
@@ -462,62 +450,62 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV64-i32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-i32-NEXT:    vse32.v v8, (a0)
 ; RV64-i32-NEXT:    flw fa5, 60(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v8
+; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-i32-NEXT:    vslidedown.vi v11, v8, 2
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    sw a0, 124(sp)
 ; RV64-i32-NEXT:    flw fa5, 56(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 120(sp)
+; RV64-i32-NEXT:    fcvt.l.s a0, fa4
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
+; RV64-i32-NEXT:    sw a1, 120(sp)
 ; RV64-i32-NEXT:    flw fa5, 52(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 116(sp)
+; RV64-i32-NEXT:    fcvt.l.s a1, fa4
+; RV64-i32-NEXT:    vfmv.f.s fa4, v11
+; RV64-i32-NEXT:    fcvt.l.s a2, fa4
+; RV64-i32-NEXT:    fcvt.l.s a3, fa5
+; RV64-i32-NEXT:    sw a3, 116(sp)
 ; RV64-i32-NEXT:    flw fa5, 48(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 112(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 7
+; RV64-i32-NEXT:    fcvt.l.s a3, fa4
+; RV64-i32-NEXT:    fcvt.l.s a4, fa5
+; RV64-i32-NEXT:    sw a4, 112(sp)
 ; RV64-i32-NEXT:    flw fa5, 44(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 108(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-i32-NEXT:    fcvt.l.s a4, fa4
+; RV64-i32-NEXT:    fcvt.l.s a5, fa5
+; RV64-i32-NEXT:    sw a5, 108(sp)
 ; RV64-i32-NEXT:    flw fa5, 40(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 104(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-i32-NEXT:    fcvt.l.s a5, fa4
+; RV64-i32-NEXT:    fcvt.l.s a6, fa5
+; RV64-i32-NEXT:    sw a6, 104(sp)
 ; RV64-i32-NEXT:    flw fa5, 36(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 100(sp)
-; RV64-i32-NEXT:    flw fa5, 32(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 96(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    fcvt.l.s a6, fa4
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 4
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a2, fa5
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a3, fa5
+; RV64-i32-NEXT:    sw a7, 100(sp)
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
+; RV64-i32-NEXT:    flw fa5, 32(sp)
 ; RV64-i32-NEXT:    sw a0, 64(sp)
 ; RV64-i32-NEXT:    sw a3, 68(sp)
 ; RV64-i32-NEXT:    sw a2, 72(sp)
 ; RV64-i32-NEXT:    sw a1, 76(sp)
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
+; RV64-i32-NEXT:    sw a7, 80(sp)
+; RV64-i32-NEXT:    sw a6, 84(sp)
+; RV64-i32-NEXT:    sw a5, 88(sp)
+; RV64-i32-NEXT:    sw a4, 92(sp)
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a2, fa5
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.s a3, fa5
-; RV64-i32-NEXT:    sw a3, 80(sp)
-; RV64-i32-NEXT:    sw a2, 84(sp)
-; RV64-i32-NEXT:    sw a1, 88(sp)
-; RV64-i32-NEXT:    sw a0, 92(sp)
+; RV64-i32-NEXT:    sw a0, 96(sp)
 ; RV64-i32-NEXT:    addi a0, sp, 64
 ; RV64-i32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-i32-NEXT:    vle32.v v8, (a0)
@@ -546,62 +534,62 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV64-i64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-i64-NEXT:    vse32.v v8, (a0)
 ; RV64-i64-NEXT:    flw fa5, 124(sp)
+; RV64-i64-NEXT:    vfmv.f.s fa4, v8
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-i64-NEXT:    vslidedown.vi v11, v8, 2
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    sd a0, 248(sp)
 ; RV64-i64-NEXT:    flw fa5, 120(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 240(sp)
+; RV64-i64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-i64-NEXT:    fcvt.l.s a0, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
+; RV64-i64-NEXT:    fcvt.l.s a1, fa5
+; RV64-i64-NEXT:    sd a1, 240(sp)
 ; RV64-i64-NEXT:    flw fa5, 116(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 232(sp)
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-i64-NEXT:    vslidedown.vi v14, v8, 7
+; RV64-i64-NEXT:    fcvt.l.s a1, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v11
+; RV64-i64-NEXT:    fcvt.l.s a2, fa5
+; RV64-i64-NEXT:    sd a2, 232(sp)
 ; RV64-i64-NEXT:    flw fa5, 112(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 224(sp)
+; RV64-i64-NEXT:    fcvt.l.s a2, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v12
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-i64-NEXT:    fcvt.l.s a3, fa5
+; RV64-i64-NEXT:    sd a3, 224(sp)
 ; RV64-i64-NEXT:    flw fa5, 108(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 216(sp)
+; RV64-i64-NEXT:    fcvt.l.s a3, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v14
+; RV64-i64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-i64-NEXT:    fcvt.l.s a4, fa5
+; RV64-i64-NEXT:    sd a4, 216(sp)
 ; RV64-i64-NEXT:    flw fa5, 104(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 208(sp)
+; RV64-i64-NEXT:    fcvt.l.s a4, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
+; RV64-i64-NEXT:    fcvt.l.s a5, fa4
+; RV64-i64-NEXT:    fcvt.l.s a6, fa5
+; RV64-i64-NEXT:    sd a6, 208(sp)
 ; RV64-i64-NEXT:    flw fa5, 100(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 200(sp)
-; RV64-i64-NEXT:    flw fa5, 96(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 192(sp)
+; RV64-i64-NEXT:    vfmv.f.s fa4, v12
+; RV64-i64-NEXT:    fcvt.l.s a6, fa4
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 4
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a1, fa5
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a2, fa5
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a3, fa5
+; RV64-i64-NEXT:    sd a7, 200(sp)
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
+; RV64-i64-NEXT:    flw fa5, 96(sp)
 ; RV64-i64-NEXT:    sd a0, 128(sp)
 ; RV64-i64-NEXT:    sd a3, 136(sp)
 ; RV64-i64-NEXT:    sd a2, 144(sp)
 ; RV64-i64-NEXT:    sd a1, 152(sp)
-; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    sd a7, 160(sp)
+; RV64-i64-NEXT:    sd a6, 168(sp)
+; RV64-i64-NEXT:    sd a5, 176(sp)
+; RV64-i64-NEXT:    sd a4, 184(sp)
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a1, fa5
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a2, fa5
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a3, fa5
-; RV64-i64-NEXT:    sd a3, 160(sp)
-; RV64-i64-NEXT:    sd a2, 168(sp)
-; RV64-i64-NEXT:    sd a1, 176(sp)
-; RV64-i64-NEXT:    sd a0, 184(sp)
+; RV64-i64-NEXT:    sd a0, 192(sp)
 ; RV64-i64-NEXT:    addi a0, sp, 128
 ; RV64-i64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-i64-NEXT:    vle64.v v8, (a0)
@@ -653,38 +641,38 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v9
-; RV32-NEXT:    fcvt.w.d a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v9
 ; RV32-NEXT:    fcvt.w.d a1, fa5
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v2f64:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v9
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i32-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-i32-NEXT:    vmv.v.x v8, a1
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    vmv.v.x v8, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v2f64:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.d a1, fa5
-; RV64-i64-NEXT:    vmv.v.x v8, a1
-; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i64-NEXT:    vmv.v.x v8, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i64-NEXT:    ret
   %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double> %x)
   ret <2 x iXLen> %a
@@ -696,71 +684,70 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.d a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.d a1, fa5
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v10
+; RV32-NEXT:    fcvt.w.d a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v12
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    fcvt.w.d a0, fa5
-; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.d a0, fa5
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
+; RV32-NEXT:    vslide1down.vx v8, v9, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v4f64:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.d a1, fa5
-; RV64-i32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-i32-NEXT:    vmv.v.x v10, a1
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v10
+; RV64-i32-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v12
+; RV64-i32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-i32-NEXT:    vmv.v.x v9, a0
 ; RV64-i32-NEXT:    fcvt.l.d a0, fa5
-; RV64-i32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v9, a1
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v4f64:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.d a1, fa5
-; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v10, a1
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
+; RV64-i64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v12
+; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-i64-NEXT:    vmv.v.x v10, a0
 ; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    ret
   %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double> %x)
   ret <4 x iXLen> %a
@@ -780,21 +767,22 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.d a1, fa5
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.d a2, fa5
+; RV32-NEXT:    vslidedown.vi v14, v8, 2
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vse64.v v8, (a0)
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.d a3, fa5
+; RV32-NEXT:    vfmv.f.s fa4, v12
+; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v14
+; RV32-NEXT:    vfmv.f.s fa3, v8
+; RV32-NEXT:    fcvt.w.d a1, fa4
+; RV32-NEXT:    fcvt.w.d a2, fa5
+; RV32-NEXT:    fcvt.w.d a3, fa3
 ; RV32-NEXT:    fld fa5, 32(sp)
 ; RV32-NEXT:    fld fa4, 40(sp)
 ; RV32-NEXT:    fld fa3, 48(sp)
@@ -803,8 +791,8 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    fcvt.w.d a5, fa4
 ; RV32-NEXT:    fcvt.w.d a6, fa3
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32-NEXT:    vslide1down.vx v8, v8, a4
@@ -834,21 +822,22 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i32-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-i32-NEXT:    andi sp, sp, -64
 ; RV64-i32-NEXT:    mv a0, sp
-; RV64-i32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-i32-NEXT:    vse64.v v8, (a0)
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vslidedown.vi v12, v8, 1
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.d a2, fa5
+; RV64-i32-NEXT:    vslidedown.vi v14, v8, 2
+; RV64-i32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-i32-NEXT:    vse64.v v8, (a0)
+; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.d a3, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa4, v12
+; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v14
+; RV64-i32-NEXT:    vfmv.f.s fa3, v8
+; RV64-i32-NEXT:    fcvt.l.d a1, fa4
+; RV64-i32-NEXT:    fcvt.l.d a2, fa5
+; RV64-i32-NEXT:    fcvt.l.d a3, fa3
 ; RV64-i32-NEXT:    fld fa5, 32(sp)
 ; RV64-i32-NEXT:    fld fa4, 40(sp)
 ; RV64-i32-NEXT:    fld fa3, 48(sp)
@@ -857,8 +846,8 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i32-NEXT:    fcvt.l.d a5, fa4
 ; RV64-i32-NEXT:    fcvt.l.d a6, fa3
 ; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-i32-NEXT:    vmv.v.x v8, a1
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    vmv.v.x v8, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a3
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a4
@@ -891,34 +880,34 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-i64-NEXT:    vse64.v v8, (a0)
 ; RV64-i64-NEXT:    fld fa5, 56(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    sd a0, 120(sp)
-; RV64-i64-NEXT:    fld fa5, 48(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    sd a0, 112(sp)
-; RV64-i64-NEXT:    fld fa5, 40(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    sd a0, 104(sp)
-; RV64-i64-NEXT:    fld fa5, 32(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    sd a0, 96(sp)
-; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa4, v8
 ; RV64-i64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    fcvt.l.d a0, fa4
 ; RV64-i64-NEXT:    fcvt.l.d a1, fa5
+; RV64-i64-NEXT:    sd a1, 120(sp)
+; RV64-i64-NEXT:    fld fa5, 48(sp)
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-i64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    fcvt.l.d a1, fa4
 ; RV64-i64-NEXT:    fcvt.l.d a2, fa5
+; RV64-i64-NEXT:    sd a2, 112(sp)
+; RV64-i64-NEXT:    fld fa5, 40(sp)
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
+; RV64-i64-NEXT:    fcvt.l.d a2, fa4
 ; RV64-i64-NEXT:    vslidedown.vi v8, v8, 2
+; RV64-i64-NEXT:    fcvt.l.d a3, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    sd a3, 104(sp)
 ; RV64-i64-NEXT:    fcvt.l.d a3, fa5
+; RV64-i64-NEXT:    fld fa5, 32(sp)
 ; RV64-i64-NEXT:    sd a0, 64(sp)
 ; RV64-i64-NEXT:    sd a1, 72(sp)
 ; RV64-i64-NEXT:    sd a3, 80(sp)
 ; RV64-i64-NEXT:    sd a2, 88(sp)
+; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    sd a0, 96(sp)
 ; RV64-i64-NEXT:    addi a0, sp, 64
 ; RV64-i64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-i64-NEXT:    vle64.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
index 5b9af1a3cfe233..c29ccd45528b81 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
@@ -282,11 +282,11 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vslide1down.vx v9, v8, a0
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    vslide1down.vx v9, v9, a0
-; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
 ; CHECK-NEXT:    vslide1down.vx v8, v8, zero
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
@@ -299,11 +299,11 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) {
 ; ZVE32F-NEXT:    vmv.v.x v8, a0
 ; ZVE32F-NEXT:    vslide1down.vx v9, v8, a0
 ; ZVE32F-NEXT:    li a0, 1
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, zero
-; ZVE32F-NEXT:    vmv.v.i v0, 15
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
@@ -327,11 +327,11 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vslide1down.vx v9, v8, a0
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    vslide1down.vx v9, v9, a0
-; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
 ; CHECK-NEXT:    vslide1down.vx v8, v8, zero
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
@@ -344,11 +344,11 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %
 ; ZVE32F-NEXT:    vmv.v.x v8, a0
 ; ZVE32F-NEXT:    vslide1down.vx v9, v8, a0
 ; ZVE32F-NEXT:    li a0, 1
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, zero
-; ZVE32F-NEXT:    vmv.v.i v0, 15
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
@@ -370,12 +370,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v9, v8, a0
-; CHECK-NEXT:    vslide1down.vx v9, v9, a1
-; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
@@ -386,12 +386,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize {
 ; ZVE32F:       # %bb.0:
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; ZVE32F-NEXT:    vmv.v.x v8, a0
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v9, v8, a0
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; ZVE32F-NEXT:    vmv.v.i v0, 15
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 1ca34e9dfd1be3..6cc3f7e76797bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -345,14 +345,14 @@ define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    srai a2, a1, 31
-; RV32ZVE32F-NEXT:    vmv.x.s a3, v9
-; RV32ZVE32F-NEXT:    srai a4, a3, 31
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
-; RV32ZVE32F-NEXT:    sw a2, 12(a0)
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
+; RV32ZVE32F-NEXT:    srai a3, a1, 31
+; RV32ZVE32F-NEXT:    srai a4, a2, 31
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 8(a0)
+; RV32ZVE32F-NEXT:    sw a4, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i64:
@@ -408,13 +408,13 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    sw zero, 12(a0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
 ; RV32ZVE32F-NEXT:    andi a1, a1, 255
-; RV32ZVE32F-NEXT:    vmv.x.s a2, v9
 ; RV32ZVE32F-NEXT:    andi a2, a2, 255
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
 ; RV32ZVE32F-NEXT:    sw zero, 4(a0)
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
+; RV32ZVE32F-NEXT:    sw a2, 8(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i64:
@@ -439,8 +439,8 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
 ; RV64ZVE32F-NEXT:  .LBB7_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV64ZVE32F-NEXT:    andi a0, a0, 255
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
+; RV64ZVE32F-NEXT:    andi a0, a0, 255
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    andi a1, a1, 255
 ; RV64ZVE32F-NEXT:    ret
@@ -1038,14 +1038,14 @@ define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    srai a2, a1, 31
-; RV32ZVE32F-NEXT:    vmv.x.s a3, v9
-; RV32ZVE32F-NEXT:    srai a4, a3, 31
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
-; RV32ZVE32F-NEXT:    sw a2, 12(a0)
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
+; RV32ZVE32F-NEXT:    srai a3, a1, 31
+; RV32ZVE32F-NEXT:    srai a4, a2, 31
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 8(a0)
+; RV32ZVE32F-NEXT:    sw a4, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i64:
@@ -1100,15 +1100,15 @@ define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ; RV32ZVE32F-NEXT:    lui a1, 16
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
-; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
 ; RV32ZVE32F-NEXT:    addi a1, a1, -1
+; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV32ZVE32F-NEXT:    and a2, a2, a1
-; RV32ZVE32F-NEXT:    vmv.x.s a3, v9
 ; RV32ZVE32F-NEXT:    and a1, a3, a1
-; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a2, 0(a0)
 ; RV32ZVE32F-NEXT:    sw zero, 4(a0)
-; RV32ZVE32F-NEXT:    sw a2, 8(a0)
+; RV32ZVE32F-NEXT:    sw a1, 8(a0)
 ; RV32ZVE32F-NEXT:    sw zero, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
@@ -1135,10 +1135,10 @@ define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v8
 ; RV64ZVE32F-NEXT:    lui a1, 16
-; RV64ZVE32F-NEXT:    addiw a1, a1, -1
-; RV64ZVE32F-NEXT:    and a0, a0, a1
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
+; RV64ZVE32F-NEXT:    addiw a1, a1, -1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
+; RV64ZVE32F-NEXT:    and a0, a0, a1
 ; RV64ZVE32F-NEXT:    and a1, a2, a1
 ; RV64ZVE32F-NEXT:    ret
   %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
@@ -2100,15 +2100,15 @@ define <2 x i64> @mgather_v2i32_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
-; RV32ZVE32F-NEXT:    srai a2, a2, 31
-; RV32ZVE32F-NEXT:    vmv.x.s a3, v9
-; RV32ZVE32F-NEXT:    srai a3, a3, 31
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v9
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vse32.v v9, (a0)
+; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
+; RV32ZVE32F-NEXT:    srai a2, a2, 31
 ; RV32ZVE32F-NEXT:    vse32.v v8, (a1)
-; RV32ZVE32F-NEXT:    sw a3, 4(a0)
-; RV32ZVE32F-NEXT:    sw a2, 12(a0)
+; RV32ZVE32F-NEXT:    srai a3, a3, 31
+; RV32ZVE32F-NEXT:    sw a2, 4(a0)
+; RV32ZVE32F-NEXT:    sw a3, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i32_sextload_v2i64:
@@ -2193,10 +2193,10 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ; RV64ZVE32F-NEXT:  .LBB30_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV64ZVE32F-NEXT:    slli a0, a0, 32
-; RV64ZVE32F-NEXT:    srli a0, a0, 32
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
+; RV64ZVE32F-NEXT:    slli a0, a0, 32
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
+; RV64ZVE32F-NEXT:    srli a0, a0, 32
 ; RV64ZVE32F-NEXT:    slli a1, a1, 32
 ; RV64ZVE32F-NEXT:    srli a1, a1, 32
 ; RV64ZVE32F-NEXT:    ret
@@ -3776,28 +3776,28 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) {
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw a2, 0(a1)
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a3, v9
-; RV32ZVE32F-NEXT:    lw a4, 0(a3)
-; RV32ZVE32F-NEXT:    lw a3, 4(a3)
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v9
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
-; RV32ZVE32F-NEXT:    lw a6, 0(a5)
-; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vmv.x.s a7, v8
-; RV32ZVE32F-NEXT:    lw t0, 0(a7)
-; RV32ZVE32F-NEXT:    lw a7, 4(a7)
-; RV32ZVE32F-NEXT:    sw a6, 16(a0)
-; RV32ZVE32F-NEXT:    sw a5, 20(a0)
+; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
+; RV32ZVE32F-NEXT:    vmv.x.s a4, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a5, v8
+; RV32ZVE32F-NEXT:    lw a6, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
+; RV32ZVE32F-NEXT:    lw a7, 0(a4)
+; RV32ZVE32F-NEXT:    lw a4, 4(a4)
+; RV32ZVE32F-NEXT:    lw t0, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
+; RV32ZVE32F-NEXT:    sw a7, 16(a0)
+; RV32ZVE32F-NEXT:    sw a4, 20(a0)
 ; RV32ZVE32F-NEXT:    sw t0, 24(a0)
-; RV32ZVE32F-NEXT:    sw a7, 28(a0)
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 28(a0)
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a4, 8(a0)
-; RV32ZVE32F-NEXT:    sw a3, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 8(a0)
+; RV32ZVE32F-NEXT:    sw a2, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_truemask_v4i64:
@@ -4132,11 +4132,11 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB48_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -4409,11 +4409,11 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB49_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -4688,11 +4688,11 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB50_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -4974,11 +4974,11 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB51_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -5252,11 +5252,11 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB52_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -5532,11 +5532,11 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB53_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -6666,6 +6666,9 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    lw a2, 24(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v8, t0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
@@ -6674,10 +6677,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
 ; RV32ZVE32F-NEXT:    andi a2, t0, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a2, .LBB57_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -11097,14 +11097,14 @@ define <4 x double> @mgather_truemask_v4f64(<4 x ptr> %ptrs, <4 x double> %passt
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    fld fa5, 0(a1)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV32ZVE32F-NEXT:    fld fa5, 0(a1)
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
-; RV32ZVE32F-NEXT:    fld fa4, 0(a1)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV32ZVE32F-NEXT:    fld fa4, 0(a1)
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV32ZVE32F-NEXT:    fld fa3, 0(a1)
-; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV32ZVE32F-NEXT:    fld fa2, 0(a1)
 ; RV32ZVE32F-NEXT:    fsd fa5, 0(a0)
@@ -11375,11 +11375,11 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB97_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -11590,11 +11590,11 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB98_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -11807,11 +11807,11 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB99_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -12031,11 +12031,11 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB100_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -12247,11 +12247,11 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB101_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -12465,11 +12465,11 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB102_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -13348,21 +13348,21 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1>
 ; RV32ZVE32F-NEXT:    lw a7, 0(a2)
 ; RV32ZVE32F-NEXT:    lw t0, 8(a2)
 ; RV32ZVE32F-NEXT:    lw t1, 16(a2)
-; RV32ZVE32F-NEXT:    lw a2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 24(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v8, a7
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    andi a3, a2, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB106_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -13807,14 +13807,14 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64V-NEXT:    vsext.vf8 v16, v8
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64V-NEXT:    vslidedown.vi v12, v10, 16
+; RV64V-NEXT:    vslidedown.vi v14, v8, 16
+; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64V-NEXT:    vslidedown.vi v8, v0, 2
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-NEXT:    vluxei64.v v10, (a0), v16, v0.t
-; RV64V-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
-; RV64V-NEXT:    vslidedown.vi v8, v8, 16
-; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64V-NEXT:    vslidedown.vi v0, v0, 2
-; RV64V-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64V-NEXT:    vsext.vf8 v16, v8
+; RV64V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64V-NEXT:    vsext.vf8 v16, v14
+; RV64V-NEXT:    vmv1r.v v0, v8
 ; RV64V-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64V-NEXT:    vluxei64.v v12, (a0), v16, v0.t
 ; RV64V-NEXT:    li a0, 32
@@ -14384,65 +14384,65 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vadd.vx v8, v8, a0
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    lbu a1, 0(a0)
-; RV32-NEXT:    lbu a0, 1(a0)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    lbu a3, 1(a2)
-; RV32-NEXT:    lbu a2, 0(a2)
-; RV32-NEXT:    slli a0, a0, 8
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    slli a3, a3, 8
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vslidedown.vi v11, v8, 2
 ; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    lbu a4, 0(a1)
-; RV32-NEXT:    lbu a1, 1(a1)
 ; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a5, v10
-; RV32-NEXT:    lbu a6, 0(a5)
-; RV32-NEXT:    lbu a5, 1(a5)
-; RV32-NEXT:    or a2, a3, a2
-; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    or a1, a1, a4
-; RV32-NEXT:    slli a5, a5, 8
+; RV32-NEXT:    vmv.x.s a2, v11
+; RV32-NEXT:    vmv.x.s a3, v10
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    lbu a4, 0(a3)
-; RV32-NEXT:    lbu a3, 1(a3)
+; RV32-NEXT:    vmv.x.s a4, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a7, v10
-; RV32-NEXT:    lbu t0, 0(a7)
-; RV32-NEXT:    lbu a7, 1(a7)
-; RV32-NEXT:    or a5, a5, a6
-; RV32-NEXT:    slli a3, a3, 8
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    slli a7, a7, 8
+; RV32-NEXT:    vmv.x.s a5, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    lbu a6, 0(a4)
-; RV32-NEXT:    lbu a4, 1(a4)
 ; RV32-NEXT:    vslidedown.vi v8, v8, 7
-; RV32-NEXT:    vmv.x.s t1, v8
-; RV32-NEXT:    lbu t2, 0(t1)
-; RV32-NEXT:    lbu t1, 1(t1)
-; RV32-NEXT:    or a7, a7, t0
+; RV32-NEXT:    lbu a6, 0(a0)
+; RV32-NEXT:    lbu a0, 1(a0)
+; RV32-NEXT:    vmv.x.s a7, v10
+; RV32-NEXT:    vmv.x.s t0, v8
+; RV32-NEXT:    lbu t1, 0(a1)
+; RV32-NEXT:    lbu a1, 1(a1)
+; RV32-NEXT:    lbu t2, 0(a2)
+; RV32-NEXT:    lbu a2, 1(a2)
+; RV32-NEXT:    slli a0, a0, 8
+; RV32-NEXT:    or a0, a0, a6
+; RV32-NEXT:    lbu a6, 0(a3)
+; RV32-NEXT:    lbu a3, 1(a3)
+; RV32-NEXT:    slli a1, a1, 8
+; RV32-NEXT:    or a1, a1, t1
+; RV32-NEXT:    lbu t1, 0(a4)
+; RV32-NEXT:    lbu a4, 1(a4)
+; RV32-NEXT:    slli a2, a2, 8
+; RV32-NEXT:    or a2, a2, t2
+; RV32-NEXT:    lbu t2, 0(a5)
+; RV32-NEXT:    lbu a5, 1(a5)
+; RV32-NEXT:    slli a3, a3, 8
+; RV32-NEXT:    or a3, a3, a6
+; RV32-NEXT:    lbu a6, 0(a7)
+; RV32-NEXT:    lbu a7, 1(a7)
 ; RV32-NEXT:    slli a4, a4, 8
-; RV32-NEXT:    or a4, a4, a6
-; RV32-NEXT:    slli t1, t1, 8
-; RV32-NEXT:    or a6, t1, t2
+; RV32-NEXT:    or a4, a4, t1
+; RV32-NEXT:    lbu t1, 0(t0)
+; RV32-NEXT:    lbu t0, 1(t0)
+; RV32-NEXT:    slli a5, a5, 8
+; RV32-NEXT:    or a5, a5, t2
+; RV32-NEXT:    slli a7, a7, 8
+; RV32-NEXT:    or a6, a7, a6
+; RV32-NEXT:    slli t0, t0, 8
+; RV32-NEXT:    or a7, t0, t1
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v9, v8, a5
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-NEXT:    vmv.v.x v9, a4
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vslide1down.vx v9, v9, a5
+; RV32-NEXT:    vslide1down.vx v10, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v9, a6
 ; RV32-NEXT:    vmv.v.i v0, 15
-; RV32-NEXT:    vslide1down.vx v8, v8, a6
-; RV32-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV32-NEXT:    vslide1down.vx v8, v8, a7
+; RV32-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: mgather_strided_unaligned:
@@ -14458,65 +14458,65 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV64V-NEXT:    andi sp, sp, -64
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64V-NEXT:    vid.v v8
+; RV64V-NEXT:    mv a1, sp
 ; RV64V-NEXT:    vsll.vi v8, v8, 2
 ; RV64V-NEXT:    vadd.vx v8, v8, a0
 ; RV64V-NEXT:    vmv.x.s a0, v8
-; RV64V-NEXT:    lbu a1, 0(a0)
-; RV64V-NEXT:    lbu a0, 1(a0)
 ; RV64V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64V-NEXT:    vslidedown.vi v12, v8, 1
 ; RV64V-NEXT:    vmv.x.s a2, v12
-; RV64V-NEXT:    lbu a3, 1(a2)
-; RV64V-NEXT:    lbu a2, 0(a2)
-; RV64V-NEXT:    slli a0, a0, 8
-; RV64V-NEXT:    or a0, a0, a1
-; RV64V-NEXT:    slli a3, a3, 8
 ; RV64V-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64V-NEXT:    vslidedown.vi v12, v8, 2
-; RV64V-NEXT:    vmv.x.s a1, v12
-; RV64V-NEXT:    lbu a4, 0(a1)
-; RV64V-NEXT:    lbu a1, 1(a1)
+; RV64V-NEXT:    vmv.x.s a3, v12
 ; RV64V-NEXT:    vslidedown.vi v12, v8, 3
+; RV64V-NEXT:    lbu a4, 0(a0)
+; RV64V-NEXT:    lbu a0, 1(a0)
 ; RV64V-NEXT:    vmv.x.s a5, v12
-; RV64V-NEXT:    lbu a6, 0(a5)
+; RV64V-NEXT:    lbu a6, 0(a2)
+; RV64V-NEXT:    lbu a2, 1(a2)
+; RV64V-NEXT:    lbu a7, 0(a3)
+; RV64V-NEXT:    lbu a3, 1(a3)
+; RV64V-NEXT:    lbu t0, 0(a5)
 ; RV64V-NEXT:    lbu a5, 1(a5)
-; RV64V-NEXT:    or a2, a3, a2
-; RV64V-NEXT:    slli a1, a1, 8
-; RV64V-NEXT:    or a1, a1, a4
-; RV64V-NEXT:    slli a5, a5, 8
-; RV64V-NEXT:    mv a3, sp
 ; RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64V-NEXT:    vse64.v v8, (a3)
+; RV64V-NEXT:    vse64.v v8, (a1)
+; RV64V-NEXT:    slli a0, a0, 8
+; RV64V-NEXT:    or a0, a0, a4
+; RV64V-NEXT:    slli a2, a2, 8
+; RV64V-NEXT:    slli a3, a3, 8
+; RV64V-NEXT:    or a1, a2, a6
+; RV64V-NEXT:    or a2, a3, a7
 ; RV64V-NEXT:    ld a3, 32(sp)
 ; RV64V-NEXT:    ld a4, 40(sp)
-; RV64V-NEXT:    ld a7, 48(sp)
-; RV64V-NEXT:    ld t0, 56(sp)
-; RV64V-NEXT:    lbu t1, 0(a3)
+; RV64V-NEXT:    ld a6, 48(sp)
+; RV64V-NEXT:    ld a7, 56(sp)
+; RV64V-NEXT:    slli a5, a5, 8
+; RV64V-NEXT:    or a5, a5, t0
+; RV64V-NEXT:    lbu t0, 0(a3)
 ; RV64V-NEXT:    lbu a3, 1(a3)
-; RV64V-NEXT:    lbu t2, 0(a4)
+; RV64V-NEXT:    vmv.v.x v8, a0
+; RV64V-NEXT:    lbu a0, 0(a4)
 ; RV64V-NEXT:    lbu a4, 1(a4)
-; RV64V-NEXT:    or a5, a5, a6
+; RV64V-NEXT:    vslide1down.vx v8, v8, a1
+; RV64V-NEXT:    lbu a1, 0(a6)
+; RV64V-NEXT:    lbu a6, 1(a6)
+; RV64V-NEXT:    vslide1down.vx v8, v8, a2
+; RV64V-NEXT:    lbu a2, 0(a7)
+; RV64V-NEXT:    lbu a7, 1(a7)
+; RV64V-NEXT:    vslide1down.vx v9, v8, a5
 ; RV64V-NEXT:    slli a3, a3, 8
-; RV64V-NEXT:    or a3, a3, t1
 ; RV64V-NEXT:    slli a4, a4, 8
-; RV64V-NEXT:    lbu a6, 0(a7)
-; RV64V-NEXT:    lbu a7, 1(a7)
-; RV64V-NEXT:    lbu t1, 0(t0)
-; RV64V-NEXT:    lbu t0, 1(t0)
-; RV64V-NEXT:    or a4, a4, t2
+; RV64V-NEXT:    slli a6, a6, 8
 ; RV64V-NEXT:    slli a7, a7, 8
-; RV64V-NEXT:    or a6, a7, a6
-; RV64V-NEXT:    slli t0, t0, 8
-; RV64V-NEXT:    or a7, t0, t1
-; RV64V-NEXT:    vmv.v.x v8, a0
-; RV64V-NEXT:    vslide1down.vx v8, v8, a2
-; RV64V-NEXT:    vslide1down.vx v8, v8, a1
-; RV64V-NEXT:    vslide1down.vx v9, v8, a5
+; RV64V-NEXT:    or a3, a3, t0
+; RV64V-NEXT:    or a0, a4, a0
+; RV64V-NEXT:    or a1, a6, a1
+; RV64V-NEXT:    or a2, a7, a2
 ; RV64V-NEXT:    vmv.v.x v8, a3
-; RV64V-NEXT:    vslide1down.vx v8, v8, a4
-; RV64V-NEXT:    vslide1down.vx v8, v8, a6
+; RV64V-NEXT:    vslide1down.vx v8, v8, a0
+; RV64V-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64V-NEXT:    vmv.v.i v0, 15
-; RV64V-NEXT:    vslide1down.vx v8, v8, a7
+; RV64V-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64V-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; RV64V-NEXT:    addi sp, s0, -128
 ; RV64V-NEXT:    .cfi_def_cfa sp, 128
@@ -14530,49 +14530,49 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ;
 ; RV64ZVE32F-LABEL: mgather_strided_unaligned:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    lbu a1, 1(a0)
-; RV64ZVE32F-NEXT:    lbu a2, 0(a0)
-; RV64ZVE32F-NEXT:    lbu a3, 5(a0)
-; RV64ZVE32F-NEXT:    lbu a4, 4(a0)
-; RV64ZVE32F-NEXT:    slli a1, a1, 8
-; RV64ZVE32F-NEXT:    or a1, a1, a2
-; RV64ZVE32F-NEXT:    slli a3, a3, 8
-; RV64ZVE32F-NEXT:    lbu a2, 8(a0)
-; RV64ZVE32F-NEXT:    lbu a5, 9(a0)
-; RV64ZVE32F-NEXT:    lbu a6, 12(a0)
-; RV64ZVE32F-NEXT:    lbu a7, 13(a0)
-; RV64ZVE32F-NEXT:    or a3, a3, a4
-; RV64ZVE32F-NEXT:    slli a5, a5, 8
-; RV64ZVE32F-NEXT:    or a2, a5, a2
-; RV64ZVE32F-NEXT:    slli a7, a7, 8
-; RV64ZVE32F-NEXT:    lbu a4, 16(a0)
-; RV64ZVE32F-NEXT:    lbu a5, 17(a0)
-; RV64ZVE32F-NEXT:    lbu t0, 20(a0)
-; RV64ZVE32F-NEXT:    lbu t1, 21(a0)
-; RV64ZVE32F-NEXT:    or a6, a7, a6
-; RV64ZVE32F-NEXT:    slli a5, a5, 8
-; RV64ZVE32F-NEXT:    or a4, a5, a4
-; RV64ZVE32F-NEXT:    slli t1, t1, 8
-; RV64ZVE32F-NEXT:    lbu a5, 24(a0)
+; RV64ZVE32F-NEXT:    lbu a1, 0(a0)
+; RV64ZVE32F-NEXT:    lbu a2, 1(a0)
+; RV64ZVE32F-NEXT:    lbu a3, 4(a0)
+; RV64ZVE32F-NEXT:    lbu a4, 5(a0)
+; RV64ZVE32F-NEXT:    lbu a5, 8(a0)
+; RV64ZVE32F-NEXT:    lbu a6, 9(a0)
+; RV64ZVE32F-NEXT:    lbu a7, 12(a0)
+; RV64ZVE32F-NEXT:    lbu t0, 13(a0)
+; RV64ZVE32F-NEXT:    slli a2, a2, 8
+; RV64ZVE32F-NEXT:    slli a4, a4, 8
+; RV64ZVE32F-NEXT:    or a1, a2, a1
+; RV64ZVE32F-NEXT:    or a3, a4, a3
+; RV64ZVE32F-NEXT:    lbu a2, 16(a0)
+; RV64ZVE32F-NEXT:    lbu a4, 17(a0)
+; RV64ZVE32F-NEXT:    lbu t1, 20(a0)
+; RV64ZVE32F-NEXT:    lbu t2, 21(a0)
+; RV64ZVE32F-NEXT:    slli a6, a6, 8
+; RV64ZVE32F-NEXT:    or a5, a6, a5
+; RV64ZVE32F-NEXT:    slli t0, t0, 8
+; RV64ZVE32F-NEXT:    slli a4, a4, 8
+; RV64ZVE32F-NEXT:    slli t2, t2, 8
+; RV64ZVE32F-NEXT:    or a6, t0, a7
+; RV64ZVE32F-NEXT:    or a2, a4, a2
+; RV64ZVE32F-NEXT:    lbu a4, 24(a0)
 ; RV64ZVE32F-NEXT:    lbu a7, 25(a0)
-; RV64ZVE32F-NEXT:    lbu t2, 28(a0)
+; RV64ZVE32F-NEXT:    or t0, t2, t1
+; RV64ZVE32F-NEXT:    lbu t1, 28(a0)
 ; RV64ZVE32F-NEXT:    lbu a0, 29(a0)
-; RV64ZVE32F-NEXT:    or t0, t1, t0
 ; RV64ZVE32F-NEXT:    slli a7, a7, 8
-; RV64ZVE32F-NEXT:    or a5, a7, a5
-; RV64ZVE32F-NEXT:    slli a0, a0, 8
-; RV64ZVE32F-NEXT:    or a0, a0, t2
+; RV64ZVE32F-NEXT:    or a4, a7, a4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
+; RV64ZVE32F-NEXT:    slli a0, a0, 8
+; RV64ZVE32F-NEXT:    or a0, a0, t1
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a6
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a4
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a6
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14606,16 +14606,16 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 24(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 26(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14651,16 +14651,16 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 28(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 30(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 2, i64 3, i64 6, i64 7, i64 10, i64 11, i64 14, i64 15>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14696,16 +14696,16 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 20(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 22(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a3
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a2
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a7
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a6
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 14, i64 15, i64 12, i64 13, i64 10, i64 11, i64 8, i64 9>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14741,16 +14741,16 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 12(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 14(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a3
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a2
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a7
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a6
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 14, i64 15, i64 10, i64 11, i64 6, i64 7, i64 2, i64 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14785,16 +14785,16 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 4(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 6(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a1
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 2, i32 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14832,16 +14832,16 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 4(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 6(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a1
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 2, i32 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14871,24 +14871,24 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) {
 ;
 ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned2:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    lh a1, 10(a0)
-; RV64ZVE32F-NEXT:    lh a2, 18(a0)
-; RV64ZVE32F-NEXT:    lh a3, 20(a0)
-; RV64ZVE32F-NEXT:    lh a4, 2(a0)
-; RV64ZVE32F-NEXT:    lh a5, 4(a0)
-; RV64ZVE32F-NEXT:    lh a6, 6(a0)
-; RV64ZVE32F-NEXT:    lh a0, 8(a0)
+; RV64ZVE32F-NEXT:    lh a1, 2(a0)
+; RV64ZVE32F-NEXT:    lh a2, 4(a0)
+; RV64ZVE32F-NEXT:    lh a3, 6(a0)
+; RV64ZVE32F-NEXT:    lh a4, 8(a0)
+; RV64ZVE32F-NEXT:    lh a5, 10(a0)
+; RV64ZVE32F-NEXT:    lh a6, 18(a0)
+; RV64ZVE32F-NEXT:    lh a0, 20(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a4
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a3
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a0
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
+; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a0
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a3
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 1, i32 2, i32 9, i32 10, i32 4, i32 5, i32 2, i32 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14930,16 +14930,16 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 20(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 22(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14978,16 +14978,16 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 20(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 22(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -15035,16 +15035,16 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 4(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 6(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 4, i64 5, i64 6, i64 7, i64 0, i64 1, i64 2, i64 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -15083,16 +15083,16 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 12(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 14(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a2
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 0, i64 2, i64 3, i64 1, i64 4, i64 5, i64 6, i64 7>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -15152,258 +15152,258 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa s0, 0
 ; RV32ZVE32F-NEXT:    andi sp, sp, -128
 ; RV32ZVE32F-NEXT:    li a2, 32
+; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    sw a3, 236(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
+; RV32ZVE32F-NEXT:    sw a3, 232(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    addi a3, sp, 256
 ; RV32ZVE32F-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32ZVE32F-NEXT:    vid.v v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 4
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 252(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 248(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 1
+; RV32ZVE32F-NEXT:    vslidedown.vi v17, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 244(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 240(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 2
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 220(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 216(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 3
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 212(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 208(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s a4, v17
+; RV32ZVE32F-NEXT:    vmv.x.s a5, v16
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 4
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 236(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 232(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s a6, v16
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 5
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 228(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 224(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s a7, v16
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 6
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 204(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 200(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s t0, v16
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 7
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 196(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s t1, v16
+; RV32ZVE32F-NEXT:    lw t2, 0(a1)
+; RV32ZVE32F-NEXT:    sw t2, 196(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:    sw a1, 192(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    addi a1, sp, 256
+; RV32ZVE32F-NEXT:    lw ra, 0(a4)
+; RV32ZVE32F-NEXT:    lw a1, 4(a4)
+; RV32ZVE32F-NEXT:    sw a1, 172(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a5)
+; RV32ZVE32F-NEXT:    sw a1, 168(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a5)
+; RV32ZVE32F-NEXT:    sw a1, 164(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a6)
+; RV32ZVE32F-NEXT:    sw a1, 252(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a6)
+; RV32ZVE32F-NEXT:    sw a1, 248(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a7)
+; RV32ZVE32F-NEXT:    sw a1, 244(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a7)
+; RV32ZVE32F-NEXT:    sw a1, 240(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(t0)
+; RV32ZVE32F-NEXT:    sw a1, 188(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(t0)
+; RV32ZVE32F-NEXT:    sw a1, 184(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(t1)
+; RV32ZVE32F-NEXT:    sw a1, 180(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(t1)
+; RV32ZVE32F-NEXT:    sw a1, 176(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32ZVE32F-NEXT:    vse32.v v8, (a1)
+; RV32ZVE32F-NEXT:    vse32.v v8, (a3)
 ; RV32ZVE32F-NEXT:    lw a1, 288(sp)
 ; RV32ZVE32F-NEXT:    lw a2, 292(sp)
 ; RV32ZVE32F-NEXT:    lw a3, 296(sp)
 ; RV32ZVE32F-NEXT:    lw a4, 300(sp)
 ; RV32ZVE32F-NEXT:    lw a5, 0(a1)
-; RV32ZVE32F-NEXT:    sw a5, 188(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a5, 228(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 184(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 224(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a2)
-; RV32ZVE32F-NEXT:    sw a1, 180(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 220(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    sw a1, 176(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 216(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
-; RV32ZVE32F-NEXT:    sw a1, 172(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 212(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    sw a1, 168(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 208(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a4)
-; RV32ZVE32F-NEXT:    sw a1, 164(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 204(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a4)
-; RV32ZVE32F-NEXT:    sw a1, 160(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 200(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 304(sp)
 ; RV32ZVE32F-NEXT:    lw a2, 308(sp)
 ; RV32ZVE32F-NEXT:    lw a3, 312(sp)
 ; RV32ZVE32F-NEXT:    lw a4, 316(sp)
 ; RV32ZVE32F-NEXT:    lw a5, 0(a1)
-; RV32ZVE32F-NEXT:    sw a5, 156(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a5, 160(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 152(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 156(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a2)
-; RV32ZVE32F-NEXT:    sw a1, 148(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 152(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    sw a1, 144(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 148(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
-; RV32ZVE32F-NEXT:    sw a1, 140(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 144(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    sw a1, 136(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 140(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a4)
-; RV32ZVE32F-NEXT:    sw a1, 132(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 136(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a4)
-; RV32ZVE32F-NEXT:    sw a1, 128(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 132(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 320(sp)
 ; RV32ZVE32F-NEXT:    lw a2, 324(sp)
 ; RV32ZVE32F-NEXT:    lw a3, 328(sp)
 ; RV32ZVE32F-NEXT:    lw a4, 332(sp)
-; RV32ZVE32F-NEXT:    lw s8, 0(a1)
-; RV32ZVE32F-NEXT:    lw s9, 4(a1)
-; RV32ZVE32F-NEXT:    lw s10, 0(a2)
-; RV32ZVE32F-NEXT:    lw s11, 4(a2)
+; RV32ZVE32F-NEXT:    lw a5, 0(a1)
+; RV32ZVE32F-NEXT:    sw a5, 128(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
+; RV32ZVE32F-NEXT:    sw a1, 124(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    sw a1, 120(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a2)
+; RV32ZVE32F-NEXT:    sw a1, 116(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw s8, 0(a3)
+; RV32ZVE32F-NEXT:    lw s9, 4(a3)
+; RV32ZVE32F-NEXT:    lw s10, 0(a4)
+; RV32ZVE32F-NEXT:    lw s11, 4(a4)
+; RV32ZVE32F-NEXT:    lw a1, 336(sp)
+; RV32ZVE32F-NEXT:    lw a2, 340(sp)
+; RV32ZVE32F-NEXT:    lw a3, 344(sp)
+; RV32ZVE32F-NEXT:    lw a4, 348(sp)
+; RV32ZVE32F-NEXT:    lw t5, 0(a1)
+; RV32ZVE32F-NEXT:    lw t6, 4(a1)
+; RV32ZVE32F-NEXT:    lw s2, 0(a2)
+; RV32ZVE32F-NEXT:    lw s3, 4(a2)
+; RV32ZVE32F-NEXT:    lw a5, 0(a3)
+; RV32ZVE32F-NEXT:    lw a6, 4(a3)
+; RV32ZVE32F-NEXT:    lw a7, 0(a4)
+; RV32ZVE32F-NEXT:    lw t0, 4(a4)
+; RV32ZVE32F-NEXT:    lw a1, 352(sp)
+; RV32ZVE32F-NEXT:    lw a2, 356(sp)
+; RV32ZVE32F-NEXT:    lw a3, 360(sp)
+; RV32ZVE32F-NEXT:    lw a4, 364(sp)
+; RV32ZVE32F-NEXT:    lw t1, 0(a1)
+; RV32ZVE32F-NEXT:    sw t1, 112(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
+; RV32ZVE32F-NEXT:    sw a1, 108(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    sw a1, 104(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a2)
+; RV32ZVE32F-NEXT:    sw a1, 100(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw s4, 0(a3)
 ; RV32ZVE32F-NEXT:    lw s5, 4(a3)
 ; RV32ZVE32F-NEXT:    lw s6, 0(a4)
 ; RV32ZVE32F-NEXT:    lw s7, 4(a4)
-; RV32ZVE32F-NEXT:    lw a2, 336(sp)
-; RV32ZVE32F-NEXT:    lw a4, 340(sp)
-; RV32ZVE32F-NEXT:    lw a5, 344(sp)
-; RV32ZVE32F-NEXT:    lw a6, 348(sp)
-; RV32ZVE32F-NEXT:    lw a7, 0(a2)
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw t1, 0(a4)
-; RV32ZVE32F-NEXT:    lw t2, 4(a4)
-; RV32ZVE32F-NEXT:    lw a1, 0(a5)
-; RV32ZVE32F-NEXT:    lw a2, 4(a5)
-; RV32ZVE32F-NEXT:    lw a3, 0(a6)
-; RV32ZVE32F-NEXT:    lw a4, 4(a6)
-; RV32ZVE32F-NEXT:    lw a5, 352(sp)
-; RV32ZVE32F-NEXT:    lw a6, 356(sp)
-; RV32ZVE32F-NEXT:    lw t3, 360(sp)
-; RV32ZVE32F-NEXT:    lw t4, 364(sp)
-; RV32ZVE32F-NEXT:    lw t5, 0(a5)
-; RV32ZVE32F-NEXT:    sw t5, 116(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 4(a5)
-; RV32ZVE32F-NEXT:    sw a5, 112(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 0(a6)
-; RV32ZVE32F-NEXT:    sw a5, 124(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 4(a6)
-; RV32ZVE32F-NEXT:    sw a5, 120(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw ra, 0(t3)
-; RV32ZVE32F-NEXT:    lw a5, 4(t3)
-; RV32ZVE32F-NEXT:    sw a5, 108(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 0(t4)
-; RV32ZVE32F-NEXT:    sw a5, 104(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 4(t4)
-; RV32ZVE32F-NEXT:    sw a5, 100(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 368(sp)
-; RV32ZVE32F-NEXT:    lw a6, 372(sp)
-; RV32ZVE32F-NEXT:    lw t3, 376(sp)
-; RV32ZVE32F-NEXT:    lw t4, 380(sp)
-; RV32ZVE32F-NEXT:    lw t5, 0(a5)
-; RV32ZVE32F-NEXT:    lw t6, 4(a5)
-; RV32ZVE32F-NEXT:    lw s2, 0(a6)
-; RV32ZVE32F-NEXT:    lw s3, 4(a6)
-; RV32ZVE32F-NEXT:    lw a5, 0(t3)
-; RV32ZVE32F-NEXT:    lw a6, 4(t3)
-; RV32ZVE32F-NEXT:    lw t3, 0(t4)
-; RV32ZVE32F-NEXT:    lw t4, 4(t4)
-; RV32ZVE32F-NEXT:    sw a1, 176(a0)
-; RV32ZVE32F-NEXT:    sw a2, 180(a0)
-; RV32ZVE32F-NEXT:    sw a3, 184(a0)
-; RV32ZVE32F-NEXT:    sw a4, 188(a0)
-; RV32ZVE32F-NEXT:    sw a7, 160(a0)
-; RV32ZVE32F-NEXT:    sw t0, 164(a0)
-; RV32ZVE32F-NEXT:    sw t1, 168(a0)
-; RV32ZVE32F-NEXT:    sw t2, 172(a0)
-; RV32ZVE32F-NEXT:    sw s4, 144(a0)
-; RV32ZVE32F-NEXT:    sw s5, 148(a0)
-; RV32ZVE32F-NEXT:    sw s6, 152(a0)
-; RV32ZVE32F-NEXT:    sw s7, 156(a0)
-; RV32ZVE32F-NEXT:    sw s8, 128(a0)
-; RV32ZVE32F-NEXT:    sw s9, 132(a0)
-; RV32ZVE32F-NEXT:    sw s10, 136(a0)
-; RV32ZVE32F-NEXT:    sw s11, 140(a0)
-; RV32ZVE32F-NEXT:    lw a1, 140(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 112(a0)
-; RV32ZVE32F-NEXT:    lw a1, 136(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 116(a0)
-; RV32ZVE32F-NEXT:    lw a1, 132(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 120(a0)
-; RV32ZVE32F-NEXT:    lw a1, 128(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 124(a0)
-; RV32ZVE32F-NEXT:    lw a1, 156(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 96(a0)
-; RV32ZVE32F-NEXT:    lw a1, 152(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 100(a0)
-; RV32ZVE32F-NEXT:    lw a1, 148(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 104(a0)
-; RV32ZVE32F-NEXT:    lw a1, 144(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 108(a0)
-; RV32ZVE32F-NEXT:    lw a1, 172(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 80(a0)
-; RV32ZVE32F-NEXT:    lw a1, 168(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 84(a0)
-; RV32ZVE32F-NEXT:    lw a1, 164(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 88(a0)
-; RV32ZVE32F-NEXT:    lw a1, 160(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 92(a0)
-; RV32ZVE32F-NEXT:    lw a1, 188(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 64(a0)
-; RV32ZVE32F-NEXT:    lw a1, 184(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 68(a0)
-; RV32ZVE32F-NEXT:    lw a1, 180(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 72(a0)
-; RV32ZVE32F-NEXT:    lw a1, 176(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 76(a0)
-; RV32ZVE32F-NEXT:    sw a5, 240(a0)
-; RV32ZVE32F-NEXT:    sw a6, 244(a0)
-; RV32ZVE32F-NEXT:    sw t3, 248(a0)
-; RV32ZVE32F-NEXT:    sw t4, 252(a0)
-; RV32ZVE32F-NEXT:    sw t5, 224(a0)
-; RV32ZVE32F-NEXT:    sw t6, 228(a0)
-; RV32ZVE32F-NEXT:    sw s2, 232(a0)
-; RV32ZVE32F-NEXT:    sw s3, 236(a0)
-; RV32ZVE32F-NEXT:    sw ra, 208(a0)
-; RV32ZVE32F-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 212(a0)
-; RV32ZVE32F-NEXT:    lw a1, 104(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 216(a0)
-; RV32ZVE32F-NEXT:    lw a1, 100(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 220(a0)
-; RV32ZVE32F-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 192(a0)
+; RV32ZVE32F-NEXT:    lw a1, 368(sp)
+; RV32ZVE32F-NEXT:    lw a2, 372(sp)
+; RV32ZVE32F-NEXT:    lw a3, 376(sp)
+; RV32ZVE32F-NEXT:    lw a4, 380(sp)
+; RV32ZVE32F-NEXT:    lw t1, 0(a1)
+; RV32ZVE32F-NEXT:    lw t2, 4(a1)
+; RV32ZVE32F-NEXT:    lw t3, 0(a2)
+; RV32ZVE32F-NEXT:    lw t4, 4(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 4(a3)
+; RV32ZVE32F-NEXT:    lw a3, 0(a4)
+; RV32ZVE32F-NEXT:    lw a4, 4(a4)
+; RV32ZVE32F-NEXT:    sw ra, 16(a0)
+; RV32ZVE32F-NEXT:    lw ra, 172(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 20(a0)
+; RV32ZVE32F-NEXT:    lw ra, 168(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 24(a0)
+; RV32ZVE32F-NEXT:    lw ra, 164(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 28(a0)
+; RV32ZVE32F-NEXT:    lw ra, 236(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 0(a0)
+; RV32ZVE32F-NEXT:    lw ra, 232(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 4(a0)
+; RV32ZVE32F-NEXT:    lw ra, 196(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 8(a0)
+; RV32ZVE32F-NEXT:    lw ra, 192(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 12(a0)
+; RV32ZVE32F-NEXT:    lw ra, 188(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 48(a0)
+; RV32ZVE32F-NEXT:    lw ra, 184(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 52(a0)
+; RV32ZVE32F-NEXT:    lw ra, 180(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 56(a0)
+; RV32ZVE32F-NEXT:    lw ra, 176(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 60(a0)
+; RV32ZVE32F-NEXT:    sw a5, 176(a0)
+; RV32ZVE32F-NEXT:    sw a6, 180(a0)
+; RV32ZVE32F-NEXT:    sw a7, 184(a0)
+; RV32ZVE32F-NEXT:    sw t0, 188(a0)
+; RV32ZVE32F-NEXT:    sw t5, 160(a0)
+; RV32ZVE32F-NEXT:    sw t6, 164(a0)
+; RV32ZVE32F-NEXT:    sw s2, 168(a0)
+; RV32ZVE32F-NEXT:    sw s3, 172(a0)
+; RV32ZVE32F-NEXT:    sw s8, 144(a0)
+; RV32ZVE32F-NEXT:    sw s9, 148(a0)
+; RV32ZVE32F-NEXT:    sw s10, 152(a0)
+; RV32ZVE32F-NEXT:    sw s11, 156(a0)
+; RV32ZVE32F-NEXT:    lw a5, 128(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 128(a0)
+; RV32ZVE32F-NEXT:    lw a5, 124(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 132(a0)
+; RV32ZVE32F-NEXT:    lw a5, 120(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 136(a0)
+; RV32ZVE32F-NEXT:    lw a5, 116(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 140(a0)
+; RV32ZVE32F-NEXT:    lw a5, 144(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 112(a0)
+; RV32ZVE32F-NEXT:    lw a5, 140(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 116(a0)
+; RV32ZVE32F-NEXT:    lw a5, 136(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 120(a0)
+; RV32ZVE32F-NEXT:    lw a5, 132(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 124(a0)
+; RV32ZVE32F-NEXT:    lw a5, 160(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 96(a0)
+; RV32ZVE32F-NEXT:    lw a5, 156(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 100(a0)
+; RV32ZVE32F-NEXT:    lw a5, 152(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 104(a0)
+; RV32ZVE32F-NEXT:    lw a5, 148(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 108(a0)
+; RV32ZVE32F-NEXT:    lw a5, 212(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 80(a0)
+; RV32ZVE32F-NEXT:    lw a5, 208(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 84(a0)
+; RV32ZVE32F-NEXT:    lw a5, 204(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 88(a0)
+; RV32ZVE32F-NEXT:    lw a5, 200(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 92(a0)
+; RV32ZVE32F-NEXT:    lw a5, 228(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 64(a0)
+; RV32ZVE32F-NEXT:    lw a5, 224(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 68(a0)
+; RV32ZVE32F-NEXT:    lw a5, 220(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 72(a0)
+; RV32ZVE32F-NEXT:    lw a5, 216(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 76(a0)
+; RV32ZVE32F-NEXT:    sw a1, 240(a0)
+; RV32ZVE32F-NEXT:    sw a2, 244(a0)
+; RV32ZVE32F-NEXT:    sw a3, 248(a0)
+; RV32ZVE32F-NEXT:    sw a4, 252(a0)
+; RV32ZVE32F-NEXT:    sw t1, 224(a0)
+; RV32ZVE32F-NEXT:    sw t2, 228(a0)
+; RV32ZVE32F-NEXT:    sw t3, 232(a0)
+; RV32ZVE32F-NEXT:    sw t4, 236(a0)
+; RV32ZVE32F-NEXT:    sw s4, 208(a0)
+; RV32ZVE32F-NEXT:    sw s5, 212(a0)
+; RV32ZVE32F-NEXT:    sw s6, 216(a0)
+; RV32ZVE32F-NEXT:    sw s7, 220(a0)
 ; RV32ZVE32F-NEXT:    lw a1, 112(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a1, 192(a0)
+; RV32ZVE32F-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 196(a0)
-; RV32ZVE32F-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 104(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 200(a0)
-; RV32ZVE32F-NEXT:    lw a1, 120(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 100(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 204(a0)
-; RV32ZVE32F-NEXT:    lw a1, 220(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 16(a0)
-; RV32ZVE32F-NEXT:    lw a1, 216(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 20(a0)
-; RV32ZVE32F-NEXT:    lw a1, 212(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 24(a0)
-; RV32ZVE32F-NEXT:    lw a1, 208(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 28(a0)
 ; RV32ZVE32F-NEXT:    lw a1, 252(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 0(a0)
-; RV32ZVE32F-NEXT:    lw a1, 248(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    lw a1, 244(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
-; RV32ZVE32F-NEXT:    lw a1, 240(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 12(a0)
-; RV32ZVE32F-NEXT:    lw a1, 204(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 48(a0)
-; RV32ZVE32F-NEXT:    lw a1, 200(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 52(a0)
-; RV32ZVE32F-NEXT:    lw a1, 196(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 56(a0)
-; RV32ZVE32F-NEXT:    lw a1, 192(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 60(a0)
-; RV32ZVE32F-NEXT:    lw a1, 236(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 32(a0)
-; RV32ZVE32F-NEXT:    lw a1, 232(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 248(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 36(a0)
-; RV32ZVE32F-NEXT:    lw a1, 228(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 244(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 40(a0)
-; RV32ZVE32F-NEXT:    lw a1, 224(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 240(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 44(a0)
 ; RV32ZVE32F-NEXT:    addi sp, s0, -512
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa sp, 512
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
index dbbec96445e3ea..f72b08a405246e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
@@ -278,12 +278,12 @@ define <64 x float> @masked_load_v64f32(ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v64f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 4
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <64 x float> @llvm.masked.load.v64f32(ptr %a, i32 8, <64 x i1> %mask, <64 x float> undef)
@@ -294,12 +294,12 @@ define <128 x bfloat> @masked_load_v128bf16(ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v128bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <128 x bfloat> @llvm.masked.load.v128bf16(ptr %a, i32 8, <128 x i1> %mask, <128 x bfloat> undef)
@@ -310,12 +310,12 @@ define <128 x half> @masked_load_v128f16(ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v128f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <128 x half> @llvm.masked.load.v128f16(ptr %a, i32 8, <128 x i1> %mask, <128 x half> undef)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
index 4f3313f3760bee..e0cf39c75da240 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
@@ -276,12 +276,12 @@ define <64 x i32> @masked_load_v64i32(ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v64i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 4
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <64 x i32> @llvm.masked.load.v64i32(ptr %a, i32 8, <64 x i1> %mask, <64 x i32> undef)
@@ -303,12 +303,12 @@ define <128 x i16> @masked_load_v128i16(ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v128i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <128 x i16> @llvm.masked.load.v128i16(ptr %a, i32 8, <128 x i1> %mask, <128 x i16> undef)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 476d023b9ad6ff..575a757149ebba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -349,12 +349,12 @@ define void @mscatter_truemask_v4i8(<4 x i8> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT:    vse8.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse8.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vse8.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vse8.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-NEXT:    vse8.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vse8.v v8, (a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 1))
@@ -867,12 +867,12 @@ define void @mscatter_truemask_v4i16(<4 x i16> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vse16.v v8, (a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %val, <4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1))
@@ -1744,12 +1744,12 @@ define void @mscatter_truemask_v4i32(<4 x i32> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64ZVE32F-NEXT:    vse32.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse32.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vse32.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vse32.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-NEXT:    vse32.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vse32.v v8, (a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1))
@@ -3097,20 +3097,20 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ; RV32ZVE32F-NEXT:    lw a0, 12(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v8
-; RV32ZVE32F-NEXT:    sw a5, 0(t0)
-; RV32ZVE32F-NEXT:    sw a6, 4(t0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
-; RV32ZVE32F-NEXT:    sw a7, 0(a5)
-; RV32ZVE32F-NEXT:    sw a0, 4(a5)
+; RV32ZVE32F-NEXT:    vmv.x.s t1, v9
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
-; RV32ZVE32F-NEXT:    sw a1, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a5, 0(t0)
+; RV32ZVE32F-NEXT:    sw a6, 4(t0)
+; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a6, v8
+; RV32ZVE32F-NEXT:    sw a7, 0(t1)
+; RV32ZVE32F-NEXT:    sw a0, 4(t1)
+; RV32ZVE32F-NEXT:    sw a1, 0(a5)
+; RV32ZVE32F-NEXT:    sw a2, 4(a5)
+; RV32ZVE32F-NEXT:    sw a3, 0(a6)
+; RV32ZVE32F-NEXT:    sw a4, 4(a6)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i64:
@@ -5693,6 +5693,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
@@ -5702,6 +5703,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    .cfi_offset s6, -28
 ; RV32ZVE32F-NEXT:    .cfi_offset s7, -32
 ; RV32ZVE32F-NEXT:    .cfi_offset s8, -36
+; RV32ZVE32F-NEXT:    .cfi_offset s9, -40
 ; RV32ZVE32F-NEXT:    .cfi_remember_state
 ; RV32ZVE32F-NEXT:    lw a3, 56(a0)
 ; RV32ZVE32F-NEXT:    lw a4, 60(a0)
@@ -5724,21 +5726,21 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    lw s6, 0(a2)
 ; RV32ZVE32F-NEXT:    lw s7, 8(a2)
 ; RV32ZVE32F-NEXT:    lw s8, 16(a2)
-; RV32ZVE32F-NEXT:    lw a2, 24(a2)
+; RV32ZVE32F-NEXT:    lw s9, 24(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v8, s6
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s7
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s8
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s9
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    andi s2, a2, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez s2, .LBB51_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -5778,6 +5780,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    .cfi_restore s0
 ; RV32ZVE32F-NEXT:    .cfi_restore s1
 ; RV32ZVE32F-NEXT:    .cfi_restore s2
@@ -5787,6 +5790,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    .cfi_restore s6
 ; RV32ZVE32F-NEXT:    .cfi_restore s7
 ; RV32ZVE32F-NEXT:    .cfi_restore s8
+; RV32ZVE32F-NEXT:    .cfi_restore s9
 ; RV32ZVE32F-NEXT:    addi sp, sp, 48
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 0
 ; RV32ZVE32F-NEXT:    ret
@@ -6146,19 +6150,19 @@ define void @mscatter_truemask_v4bf16(<4 x bfloat> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v8
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a4
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v9
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a1)
-; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
-; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
+; RV64ZVE32F-NEXT:    fmv.h.x fa5, a4
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v8
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
-; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a3)
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
-; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
+; RV64ZVE32F-NEXT:    fmv.h.x fa5, a4
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> %val, <4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1))
@@ -6318,10 +6322,10 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB58_2
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB58_2: # %else
@@ -6331,11 +6335,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB58_4: # %else2
@@ -6358,11 +6362,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB58_9: # %else10
@@ -6377,11 +6381,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    ret
 ; RV64ZVE32F-NEXT:  .LBB58_12: # %cond.store3
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6390,11 +6394,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
@@ -6402,11 +6406,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:  .LBB58_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
@@ -6414,11 +6418,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    j .LBB58_9
 ; RV64ZVE32F-NEXT:  .LBB58_15: # %cond.store11
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6426,11 +6430,13 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:  .LBB58_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-NEXT:    add a0, a0, a1
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
@@ -6467,10 +6473,10 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB59_2
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB59_2: # %else
@@ -6480,11 +6486,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB59_4: # %else2
@@ -6507,11 +6513,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB59_9: # %else10
@@ -6526,11 +6532,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    ret
 ; RV64ZVE32F-NEXT:  .LBB59_12: # %cond.store3
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6539,11 +6545,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
@@ -6551,11 +6557,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB59_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
@@ -6563,11 +6569,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    j .LBB59_9
 ; RV64ZVE32F-NEXT:  .LBB59_15: # %cond.store11
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6575,11 +6581,13 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB59_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-NEXT:    add a0, a0, a1
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
@@ -6615,11 +6623,11 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB60_2
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
-; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB60_2: # %else
@@ -6629,12 +6637,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB60_4: # %else2
@@ -6657,12 +6665,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB60_9: # %else10
@@ -6677,12 +6685,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    ret
 ; RV64ZVE32F-NEXT:  .LBB60_12: # %cond.store3
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6691,12 +6699,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
@@ -6704,12 +6712,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB60_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
@@ -6717,12 +6725,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    j .LBB60_9
 ; RV64ZVE32F-NEXT:  .LBB60_15: # %cond.store11
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6730,12 +6738,14 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB60_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    andi a1, a1, 255
 ; RV64ZVE32F-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-NEXT:    add a0, a0, a1
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
@@ -6772,9 +6782,9 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
-; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB61_2: # %else
@@ -6784,11 +6794,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB61_4: # %else2
@@ -6811,11 +6821,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB61_9: # %else10
@@ -6830,11 +6840,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    ret
 ; RV64ZVE32F-NEXT:  .LBB61_12: # %cond.store3
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6843,11 +6853,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
@@ -6855,10 +6865,10 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:  .LBB61_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
@@ -6866,11 +6876,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    j .LBB61_9
 ; RV64ZVE32F-NEXT:  .LBB61_15: # %cond.store11
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6878,11 +6888,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:  .LBB61_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-NEXT:    add a0, a0, a1
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
@@ -7146,12 +7156,12 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-ZVFH-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-ZVFH-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-ZVFH-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-ZVFH-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-ZVFH-NEXT:    vse16.v v8, (a0)
 ; RV64ZVE32F-ZVFH-NEXT:    ret
 ;
@@ -7163,19 +7173,19 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a4, v8
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a4
+; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a4, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a1)
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
+; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a4
+; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a4, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a3)
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 3
-; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
-; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
+; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
   call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> %val, <4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1))
@@ -7529,10 +7539,10 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB68_2
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_2: # %else
@@ -7542,11 +7552,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_4: # %else2
@@ -7569,11 +7579,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_9: # %else10
@@ -7588,11 +7598,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_12: # %cond.store3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -7601,11 +7611,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 16
@@ -7613,11 +7623,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_14: # %cond.store7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 32
@@ -7625,11 +7635,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    j .LBB68_9
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_15: # %cond.store11
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -7637,11 +7647,13 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_16: # %cond.store13
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a0, a0, a1
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
@@ -7788,10 +7800,10 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB69_2
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_2: # %else
@@ -7801,11 +7813,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_4: # %else2
@@ -7828,11 +7840,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_9: # %else10
@@ -7847,11 +7859,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_12: # %cond.store3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -7860,11 +7872,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 16
@@ -7872,11 +7884,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_14: # %cond.store7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 32
@@ -7884,11 +7896,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    j .LBB69_9
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_15: # %cond.store11
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -7896,11 +7908,13 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_16: # %cond.store13
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a0, a0, a1
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
@@ -8054,11 +8068,11 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB70_2
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_2: # %else
@@ -8068,12 +8082,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_4: # %else2
@@ -8096,12 +8110,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_9: # %else10
@@ -8116,12 +8130,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_12: # %cond.store3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -8130,12 +8144,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 16
@@ -8143,12 +8157,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_14: # %cond.store7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 32
@@ -8156,12 +8170,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    j .LBB70_9
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_15: # %cond.store11
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -8169,12 +8183,14 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_16: # %cond.store13
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a0, a0, a1
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
@@ -8320,9 +8336,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
-; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_2: # %else
@@ -8332,11 +8348,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_4: # %else2
@@ -8359,11 +8375,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_9: # %else10
@@ -8378,11 +8394,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_12: # %cond.store3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -8391,11 +8407,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 16
@@ -8403,10 +8419,10 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_14: # %cond.store7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 32
@@ -8414,11 +8430,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    j .LBB71_9
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_15: # %cond.store11
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -8426,11 +8442,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_16: # %cond.store13
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a0, a0, a1
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
@@ -8603,12 +8619,12 @@ define void @mscatter_truemask_v4f32(<4 x float> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64ZVE32F-NEXT:    vse32.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse32.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vse32.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vse32.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-NEXT:    vse32.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vse32.v v8, (a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %val, <4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1))
@@ -9925,14 +9941,14 @@ define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x ptr> %ptrs) {
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    fsd fa0, 0(a0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV32ZVE32F-NEXT:    fsd fa0, 0(a0)
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
-; RV32ZVE32F-NEXT:    fsd fa1, 0(a0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV32ZVE32F-NEXT:    fsd fa1, 0(a0)
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
 ; RV32ZVE32F-NEXT:    fsd fa2, 0(a0)
-; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
 ; RV32ZVE32F-NEXT:    fsd fa3, 0(a0)
 ; RV32ZVE32F-NEXT:    ret
@@ -10153,11 +10169,11 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8>
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB91_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -10353,11 +10369,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB92_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -10555,11 +10571,11 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB93_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -10764,11 +10780,11 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB94_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -10965,11 +10981,11 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB95_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -11168,11 +11184,11 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB96_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -11991,21 +12007,21 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx
 ; RV32ZVE32F-NEXT:    lw a6, 0(a1)
 ; RV32ZVE32F-NEXT:    lw a7, 8(a1)
 ; RV32ZVE32F-NEXT:    lw t0, 16(a1)
-; RV32ZVE32F-NEXT:    lw a1, 24(a1)
+; RV32ZVE32F-NEXT:    lw t1, 24(a1)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v8, a6
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
 ; RV32ZVE32F-NEXT:    andi a2, a1, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB100_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -12902,8 +12918,8 @@ define void @mscatter_shuffle_rotate(<8 x i16> %val, ptr %base) {
 ; RV64ZVE32F-NEXT:    addi a6, a0, 10
 ; RV64ZVE32F-NEXT:    addi a7, a0, 8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vse16.v v8, (a7)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse16.v v8, (a7)
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a6)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a5)
@@ -12914,8 +12930,8 @@ define void @mscatter_shuffle_rotate(<8 x i16> %val, ptr %base) {
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 6
-; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 4, i64 5, i64 6, i64 7, i64 0, i64 1, i64 2, i64 3>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
index f7e311d06c03a1..ed6ec4d5659b15 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
@@ -278,12 +278,12 @@ define void @masked_store_v64f32(<64 x float> %val, ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v64f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 4
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v64f32.p0(<64 x float> %val, ptr %a, i32 8, <64 x i1> %mask)
@@ -294,12 +294,12 @@ define void @masked_store_v128bf16(<128 x bfloat> %val, ptr %a, <128 x i1> %mask
 ; CHECK-LABEL: masked_store_v128bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v128bf16.p0(<128 x bfloat> %val, ptr %a, i32 8, <128 x i1> %mask)
@@ -310,12 +310,12 @@ define void @masked_store_v128f16(<128 x half> %val, ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v128f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v128f16.p0(<128 x half> %val, ptr %a, i32 8, <128 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
index 0c9bf9a09fd6d3..c3b10db115bae5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
@@ -276,12 +276,12 @@ define void @masked_store_v64i32(<64 x i32> %val, ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v64i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 4
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v64i32.p0(<64 x i32> %val, ptr %a, i32 8, <64 x i1> %mask)
@@ -303,12 +303,12 @@ define void @masked_store_v128i16(<128 x i16> %val, ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v128i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v128i16.p0(<128 x i16> %val, ptr %a, i32 8, <128 x i1> %mask)
@@ -321,10 +321,10 @@ define void @masked_store_v256i8(<256 x i8> %val, ptr %a, <256 x i1> %mask) {
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v24, (a1)
+; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vse8.v v8, (a0), v0.t
-; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vse8.v v16, (a0), v0.t
+; CHECK-NEXT:    vse8.v v16, (a1), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v256i8.p0(<256 x i8> %val, ptr %a, i32 8, <256 x i1> %mask)
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index c47b79a2df92c5..46c2033d28b387 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -578,6 +578,12 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
@@ -589,14 +595,18 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    fsflags a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
@@ -610,6 +620,12 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    fsflags a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
@@ -624,30 +640,33 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    frflags a2
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    frflags a1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
-; CHECK-NEXT:    fsflags a1
+; CHECK-NEXT:    fsflags a2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    frflags a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    fsflags a0
+; CHECK-NEXT:    fsflags a1
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index a8798474d669ae..4f0f5dd78c94b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -437,8 +437,8 @@ define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
 ; RV32-NEXT:    vmv.x.s a1, v9
 ; RV32-NEXT:    vslidedown.vi v9, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v9
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    vmv.x.s a2, v9
 ; RV32-NEXT:    vmv.x.s a3, v8
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add a0, a0, a3
@@ -452,8 +452,8 @@ define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64-NEXT:    vmv.x.s a1, v9
 ; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v9
 ; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    vmv.x.s a2, v9
 ; RV64-NEXT:    vmv.x.s a3, v8
 ; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    add a0, a0, a3
@@ -856,13 +856,13 @@ define float @reduce_fadd_4xi32_non_associative(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
-; CHECK-NEXT:    vfmv.f.s fa5, v9
 ; CHECK-NEXT:    lui a0, 524288
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
-; CHECK-NEXT:    vfredusum.vs v8, v8, v9
-; CHECK-NEXT:    vfmv.f.s fa4, v8
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vslidedown.vi v8, v8, 3
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
 ; CHECK-NEXT:    fadd.s fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %p, align 256
@@ -887,8 +887,8 @@ define float @reduce_fadd_4xi32_non_associative2(ptr %p) {
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 1
 ; CHECK-NEXT:    vfmv.f.s fa4, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa3, v9
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 3
+; CHECK-NEXT:    vfmv.f.s fa3, v9
 ; CHECK-NEXT:    vfmv.f.s fa2, v8
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa3, fa2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 4b30725f973c7b..8bf30f8f0d072b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -797,13 +797,13 @@ define float @vreduce_ord_fwadd_v64f32(ptr %x, float %s) {
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v8, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfmv.s.f v12, fa0
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vslidedown.vx v24, v8, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwredosum.vs v8, v8, v12
-; CHECK-NEXT:    vfwredosum.vs v8, v16, v8
+; CHECK-NEXT:    vfwredosum.vs v8, v8, v16
+; CHECK-NEXT:    vfwredosum.vs v8, v24, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -1194,13 +1194,13 @@ define double @vreduce_ord_fwadd_v32f64(ptr %x, double %s) {
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v8, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m1, ta, ma
-; CHECK-NEXT:    vfmv.s.f v12, fa0
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v8, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vfwredosum.vs v8, v8, v12
-; CHECK-NEXT:    vfwredosum.vs v8, v16, v8
+; CHECK-NEXT:    vfwredosum.vs v8, v8, v16
+; CHECK-NEXT:    vfwredosum.vs v8, v24, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -1364,17 +1364,17 @@ define float @vreduce_fmin_v128f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmin_v128f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmin.vv v16, v24, v16
-; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfmin.vv v8, v24, v8
+; CHECK-NEXT:    vfmin.vv v16, v16, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -1611,17 +1611,17 @@ define float @vreduce_fmax_v128f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmax_v128f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmax.vv v16, v24, v16
-; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfmax.vv v8, v24, v8
+; CHECK-NEXT:    vfmax.vv v16, v16, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -2013,80 +2013,61 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmin.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmin.vv v8, v8, v16
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
@@ -2101,10 +2082,7 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:  .LBB121_3:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -2119,17 +2097,17 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v128f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmin.vv v16, v24, v16
-; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfmin.vv v8, v24, v8
+; CHECK-NEXT:    vfmin.vv v16, v16, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -2353,79 +2331,60 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmin.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmin.vv v8, v8, v16
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
@@ -2440,10 +2399,7 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:  .LBB133_3:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -2771,80 +2727,61 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmax.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v8, v8, v16
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
@@ -2859,10 +2796,7 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:  .LBB149_3:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -2877,17 +2811,17 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v128f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmax.vv v16, v24, v16
-; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfmax.vv v8, v24, v8
+; CHECK-NEXT:    vfmax.vv v16, v16, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -3111,79 +3045,60 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmax.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v8, v8, v16
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
@@ -3198,10 +3113,7 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:  .LBB161_3:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 71dc75dcc96c2a..f920e39e7d295c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -811,9 +811,9 @@ define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1>
 ; CHECK-NEXT:  .LBB49_2:
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v25, a0
+; CHECK-NEXT:    addi a0, a1, -32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vredxor.vs v25, v8, v25, v0.t
-; CHECK-NEXT:    addi a0, a1, -32
 ; CHECK-NEXT:    sltu a1, a1, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
@@ -838,10 +838,10 @@ define signext i64 @vpreduce_add_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredsum.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -873,10 +873,10 @@ define signext i64 @vpreduce_umax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -908,10 +908,10 @@ define signext i64 @vpreduce_smax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredmax.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -943,10 +943,10 @@ define signext i64 @vpreduce_umin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -978,10 +978,10 @@ define signext i64 @vpreduce_smin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredmin.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1013,10 +1013,10 @@ define signext i64 @vpreduce_and_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredand.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1048,10 +1048,10 @@ define signext i64 @vpreduce_or_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredor.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1083,10 +1083,10 @@ define signext i64 @vpreduce_xor_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredxor.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1118,10 +1118,10 @@ define signext i64 @vpreduce_add_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredsum.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1153,10 +1153,10 @@ define signext i64 @vpreduce_umax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1188,10 +1188,10 @@ define signext i64 @vpreduce_smax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredmax.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1223,10 +1223,10 @@ define signext i64 @vpreduce_umin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredminu.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1258,10 +1258,10 @@ define signext i64 @vpreduce_smin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredmin.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1293,10 +1293,10 @@ define signext i64 @vpreduce_and_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredand.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1328,10 +1328,10 @@ define signext i64 @vpreduce_or_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredor.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1363,10 +1363,10 @@ define signext i64 @vpreduce_xor_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredxor.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1797,12 +1797,12 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV32-NEXT:    vle8.v v12, (a3)
 ; RV32-NEXT:    vid.v v16
 ; RV32-NEXT:    vmsltu.vx v14, v16, a1
+; RV32-NEXT:    li a3, 64
 ; RV32-NEXT:    vsext.vf4 v16, v12
 ; RV32-NEXT:    vmsltu.vx v12, v16, a1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vslideup.vi v14, v12, 4
-; RV32-NEXT:    li a1, 64
-; RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
 ; RV32-NEXT:    vmand.mm v0, v14, v0
 ; RV32-NEXT:    vmv.v.i v12, 1
 ; RV32-NEXT:    vmerge.vvm v8, v12, v8, v0
@@ -1843,12 +1843,12 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV64-NEXT:    vle8.v v12, (a3)
 ; RV64-NEXT:    vid.v v16
 ; RV64-NEXT:    vmsltu.vx v14, v16, a1
+; RV64-NEXT:    li a3, 64
 ; RV64-NEXT:    vsext.vf4 v16, v12
 ; RV64-NEXT:    vmsltu.vx v12, v16, a1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vslideup.vi v14, v12, 4
-; RV64-NEXT:    li a1, 64
-; RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
 ; RV64-NEXT:    vmand.mm v0, v14, v0
 ; RV64-NEXT:    vmv.v.i v12, 1
 ; RV64-NEXT:    vmerge.vvm v8, v12, v8, v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
index 9f674ea6dfd607..2ea618bf8a2260 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
@@ -967,8 +967,8 @@ define i64 @vwreduce_add_v1i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vsext.vf2 v9, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsext.vf2 v9, v8
 ; RV32-NEXT:    vsrl.vx v8, v9, a0
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    vmv.x.s a0, v9
@@ -992,8 +992,8 @@ define i64 @vwreduce_uadd_v1i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vzext.vf2 v9, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vzext.vf2 v9, v8
 ; RV32-NEXT:    vsrl.vx v8, v9, a0
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    vmv.x.s a0, v9
@@ -1020,9 +1020,9 @@ define i64 @vreduce_add_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1047,11 +1047,11 @@ define i64 @vwreduce_add_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1078,11 +1078,11 @@ define i64 @vwreduce_uadd_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1111,9 +1111,9 @@ define i64 @vreduce_add_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1138,11 +1138,11 @@ define i64 @vwreduce_add_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1169,11 +1169,11 @@ define i64 @vwreduce_uadd_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1202,9 +1202,9 @@ define i64 @vreduce_add_v8i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v12
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1229,11 +1229,11 @@ define i64 @vwreduce_add_v8i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1260,11 +1260,11 @@ define i64 @vwreduce_uadd_v8i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1293,9 +1293,9 @@ define i64 @vreduce_add_v16i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v16, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1320,11 +1320,11 @@ define i64 @vwreduce_add_v16i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1351,11 +1351,11 @@ define i64 @vwreduce_uadd_v16i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1387,9 +1387,9 @@ define i64 @vreduce_add_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v16, (a0)
 ; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1494,21 +1494,21 @@ declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
 define i64 @vreduce_add_v64i64(ptr %x) nounwind {
 ; RV32-LABEL: vreduce_add_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
-; RV32-NEXT:    addi a1, a0, 256
-; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    vadd.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 256
+; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    vadd.vv v24, v0, v24
+; RV32-NEXT:    vmv.s.x v7, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    vmv.s.x v16, zero
-; RV32-NEXT:    vredsum.vs v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vredsum.vs v8, v8, v7
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1539,51 +1539,27 @@ define i64 @vreduce_add_v64i64(ptr %x) nounwind {
 define i64 @vwreduce_add_v64i64(ptr %x) {
 ; RV32-LABEL: vwreduce_add_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vle32.v v16, (a0)
+; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v8, 16
 ; RV32-NEXT:    vslidedown.vi v0, v16, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwadd.vv v8, v24, v0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vwadd.vv v0, v8, v16
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vwadd.vv v24, v16, v8
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vwadd.vv v16, v0, v8
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vadd.vv v8, v24, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vmv.x.s a1, v8
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add sp, sp, a2
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vwreduce_add_v64i64:
@@ -1591,41 +1567,30 @@ define i64 @vwreduce_add_v64i64(ptr %x) {
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vle32.v v16, (a1)
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v8, 16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vslidedown.vi v0, v16, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vwadd.vv v8, v24, v0
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vwadd.vv v0, v8, v16
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vwadd.vv v24, v8, v16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vwadd.vv v8, v16, v0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v8, v0, v8
+; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vmv.s.x v16, zero
 ; RV64-NEXT:    vredsum.vs v8, v8, v16
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add sp, sp, a1
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -1640,51 +1605,27 @@ define i64 @vwreduce_add_v64i64(ptr %x) {
 define i64 @vwreduce_uadd_v64i64(ptr %x) {
 ; RV32-LABEL: vwreduce_uadd_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vle32.v v16, (a0)
+; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v8, 16
 ; RV32-NEXT:    vslidedown.vi v0, v16, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwaddu.vv v8, v24, v0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vwaddu.vv v0, v8, v16
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vwaddu.vv v24, v16, v8
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vwaddu.vv v16, v0, v8
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vadd.vv v8, v24, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vmv.x.s a1, v8
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add sp, sp, a2
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vwreduce_uadd_v64i64:
@@ -1692,41 +1633,30 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) {
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vle32.v v16, (a1)
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v8, 16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vslidedown.vi v0, v16, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vwaddu.vv v8, v24, v0
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vwaddu.vv v0, v8, v16
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vwaddu.vv v24, v8, v16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vwaddu.vv v8, v16, v0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v8, v0, v8
+; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vmv.s.x v16, zero
 ; RV64-NEXT:    vredsum.vs v8, v8, v16
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add sp, sp, a1
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -2162,8 +2092,8 @@ define i64 @vreduce_and_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -2189,9 +2119,9 @@ define i64 @vreduce_and_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2216,9 +2146,9 @@ define i64 @vreduce_and_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2243,9 +2173,9 @@ define i64 @vreduce_and_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2272,10 +2202,10 @@ define i64 @vreduce_and_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2307,14 +2237,14 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vand.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2763,8 +2693,8 @@ define i64 @vreduce_or_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -2790,9 +2720,9 @@ define i64 @vreduce_or_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2817,9 +2747,9 @@ define i64 @vreduce_or_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2844,9 +2774,9 @@ define i64 @vreduce_or_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2873,10 +2803,10 @@ define i64 @vreduce_or_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2908,14 +2838,14 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3386,9 +3316,9 @@ define i64 @vreduce_xor_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3415,9 +3345,9 @@ define i64 @vreduce_xor_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3444,9 +3374,9 @@ define i64 @vreduce_xor_v8i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v12
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3473,9 +3403,9 @@ define i64 @vreduce_xor_v16i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v16, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3505,9 +3435,9 @@ define i64 @vreduce_xor_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v16, (a0)
 ; RV32-NEXT:    vxor.vv v8, v8, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3534,21 +3464,21 @@ declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>)
 define i64 @vreduce_xor_v64i64(ptr %x) nounwind {
 ; RV32-LABEL: vreduce_xor_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
-; RV32-NEXT:    addi a1, a0, 256
-; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vxor.vv v16, v24, v16
-; RV32-NEXT:    vxor.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 256
+; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    vxor.vv v24, v0, v24
+; RV32-NEXT:    vmv.s.x v7, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vxor.vv v8, v8, v16
-; RV32-NEXT:    vmv.s.x v16, zero
-; RV32-NEXT:    vredxor.vs v8, v8, v16
+; RV32-NEXT:    vxor.vv v8, v8, v24
+; RV32-NEXT:    vredxor.vs v8, v8, v7
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3999,8 +3929,8 @@ define i64 @vreduce_smin_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -4026,9 +3956,9 @@ define i64 @vreduce_smin_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4053,9 +3983,9 @@ define i64 @vreduce_smin_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4080,9 +4010,9 @@ define i64 @vreduce_smin_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4109,10 +4039,10 @@ define i64 @vreduce_smin_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vmin.vv v8, v8, v16
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4144,14 +4074,14 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vmin.vv v16, v24, v16
-; RV32-NEXT:    vmin.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmin.vv v16, v0, v16
+; RV32-NEXT:    vmin.vv v8, v8, v24
 ; RV32-NEXT:    vmin.vv v8, v8, v16
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4601,8 +4531,8 @@ define i64 @vreduce_smax_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -4628,9 +4558,9 @@ define i64 @vreduce_smax_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4655,9 +4585,9 @@ define i64 @vreduce_smax_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4682,9 +4612,9 @@ define i64 @vreduce_smax_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4711,10 +4641,10 @@ define i64 @vreduce_smax_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vmax.vv v8, v8, v16
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4746,14 +4676,14 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vmax.vv v16, v24, v16
-; RV32-NEXT:    vmax.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmax.vv v16, v0, v16
+; RV32-NEXT:    vmax.vv v8, v8, v24
 ; RV32-NEXT:    vmax.vv v8, v8, v16
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5203,8 +5133,8 @@ define i64 @vreduce_umin_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -5230,9 +5160,9 @@ define i64 @vreduce_umin_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5257,9 +5187,9 @@ define i64 @vreduce_umin_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5284,9 +5214,9 @@ define i64 @vreduce_umin_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5313,10 +5243,10 @@ define i64 @vreduce_umin_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vminu.vv v8, v8, v16
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5348,14 +5278,14 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vminu.vv v16, v24, v16
-; RV32-NEXT:    vminu.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vminu.vv v16, v0, v16
+; RV32-NEXT:    vminu.vv v8, v8, v24
 ; RV32-NEXT:    vminu.vv v8, v8, v16
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5804,8 +5734,8 @@ define i64 @vreduce_umax_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -5831,9 +5761,9 @@ define i64 @vreduce_umax_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5858,9 +5788,9 @@ define i64 @vreduce_umax_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5885,9 +5815,9 @@ define i64 @vreduce_umax_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5914,10 +5844,10 @@ define i64 @vreduce_umax_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5949,14 +5879,14 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vmaxu.vv v16, v24, v16
-; RV32-NEXT:    vmaxu.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmaxu.vv v16, v0, v16
+; RV32-NEXT:    vmaxu.vv v8, v8, v24
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -6191,8 +6121,8 @@ define i8 @vreduce_mul_v256i8(ptr %x) {
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle8.v v16, (a0)
-; CHECK-NEXT:    vmul.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vmul.vv v8, v8, v16
 ; CHECK-NEXT:    vslidedown.vx v16, v8, a0
 ; CHECK-NEXT:    vmul.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 32
@@ -6565,9 +6495,9 @@ define i64 @vreduce_mul_v2i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 8
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -6593,12 +6523,12 @@ define i64 @vreduce_mul_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    vrgather.vi v10, v8, 1
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -6626,6 +6556,7 @@ define i64 @vreduce_mul_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
@@ -6633,7 +6564,6 @@ define i64 @vreduce_mul_v8i64(ptr %x) {
 ; RV32-NEXT:    vrgather.vi v12, v8, 1
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -6663,6 +6593,7 @@ define i64 @vreduce_mul_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vslidedown.vi v16, v8, 8
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 4
@@ -6672,7 +6603,6 @@ define i64 @vreduce_mul_v16i64(ptr %x) {
 ; RV32-NEXT:    vrgather.vi v16, v8, 1
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index 483fad54203f99..b8617fda3aa7ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -533,25 +533,26 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
@@ -583,15 +584,15 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    addi a2, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a2
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v7, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index a2dc0181c56c4d..820a05e3d6042b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -72,10 +72,10 @@ define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -154,10 +154,10 @@ define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -238,10 +238,10 @@ define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -283,10 +283,10 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -324,10 +324,10 @@ define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.round.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 4
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 4
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 4
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index 83c4723d00cb76..8391c7939180a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -72,10 +72,10 @@ define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -154,10 +154,10 @@ define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -238,10 +238,10 @@ define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -283,10 +283,10 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -324,10 +324,10 @@ define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %e
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.roundeven.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 0
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index 35d1e59a5a3798..8c38d244602655 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -72,10 +72,10 @@ define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %ev
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -154,10 +154,10 @@ define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %ev
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -238,10 +238,10 @@ define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %ev
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -283,10 +283,10 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -324,10 +324,10 @@ define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
index 80561be0ca2f5f..8da605d35270de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
@@ -102,37 +102,37 @@ define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    vle8.v v11, (a1)
-; CHECK-NEXT:    vminu.vv v12, v8, v9
-; CHECK-NEXT:    vmaxu.vv v8, v8, v9
-; CHECK-NEXT:    vsub.vv v8, v8, v12
-; CHECK-NEXT:    vminu.vv v9, v10, v11
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    vle8.v v12, (a0)
 ; CHECK-NEXT:    vle8.v v13, (a1)
-; CHECK-NEXT:    vmaxu.vv v10, v10, v11
-; CHECK-NEXT:    vsub.vv v9, v10, v9
-; CHECK-NEXT:    vwaddu.vv v10, v9, v8
-; CHECK-NEXT:    vminu.vv v8, v12, v13
-; CHECK-NEXT:    vmaxu.vv v9, v12, v13
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    vle8.v v12, (a0)
-; CHECK-NEXT:    vle8.v v13, (a1)
-; CHECK-NEXT:    vsub.vv v8, v9, v8
+; CHECK-NEXT:    vminu.vv v14, v8, v9
+; CHECK-NEXT:    vmaxu.vv v8, v8, v9
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsub.vv v8, v8, v14
+; CHECK-NEXT:    vminu.vv v14, v10, v11
+; CHECK-NEXT:    vmaxu.vv v10, v10, v11
+; CHECK-NEXT:    vle8.v v11, (a1)
+; CHECK-NEXT:    vsub.vv v10, v10, v14
+; CHECK-NEXT:    vminu.vv v14, v12, v13
+; CHECK-NEXT:    vmaxu.vv v12, v12, v13
+; CHECK-NEXT:    vwaddu.vv v16, v10, v8
+; CHECK-NEXT:    vsub.vv v8, v12, v14
+; CHECK-NEXT:    vminu.vv v10, v9, v11
+; CHECK-NEXT:    vmaxu.vv v9, v9, v11
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v14, v8
-; CHECK-NEXT:    vwaddu.vv v16, v14, v10
+; CHECK-NEXT:    vzext.vf2 v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vminu.vv v8, v12, v13
-; CHECK-NEXT:    vmaxu.vv v9, v12, v13
-; CHECK-NEXT:    vsub.vv v8, v9, v8
+; CHECK-NEXT:    vsub.vv v8, v9, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vwaddu.vv v20, v12, v16
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vwaddu.wv v16, v16, v10
+; CHECK-NEXT:    vwaddu.wv v20, v20, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vredsum.vs v8, v16, v8
+; CHECK-NEXT:    vredsum.vs v8, v20, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
index 4621f339ca8828..6b81b781a898f9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
@@ -14,33 +14,31 @@ define <8 x float> @fpext_v8bf16(<8 x bfloat> %x) {
 ; CHECK-NEXT:    fmv.x.w a6, fa6
 ; CHECK-NEXT:    fmv.x.w a7, fa7
 ; CHECK-NEXT:    slli a7, a7, 16
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a7
 ; CHECK-NEXT:    slli a6, a6, 16
-; CHECK-NEXT:    vmv.s.x v9, a6
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    slli a5, a5, 16
-; CHECK-NEXT:    vmv.s.x v8, a5
 ; CHECK-NEXT:    slli a4, a4, 16
-; CHECK-NEXT:    vmv.s.x v10, a4
-; CHECK-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v9, 2
 ; CHECK-NEXT:    slli a3, a3, 16
-; CHECK-NEXT:    vmv.s.x v8, a3
 ; CHECK-NEXT:    slli a2, a2, 16
-; CHECK-NEXT:    vmv.s.x v9, a2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    slli a1, a1, 16
-; CHECK-NEXT:    vmv.s.x v11, a1
 ; CHECK-NEXT:    slli a0, a0, 16
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, a7
+; CHECK-NEXT:    vmv.s.x v9, a6
+; CHECK-NEXT:    vmv.s.x v10, a5
+; CHECK-NEXT:    vmv.s.x v12, a4
+; CHECK-NEXT:    vmv.s.x v11, a3
+; CHECK-NEXT:    vmv.s.x v13, a2
+; CHECK-NEXT:    vslideup.vi v9, v8, 1
+; CHECK-NEXT:    vmv.s.x v14, a1
+; CHECK-NEXT:    vslideup.vi v12, v10, 1
+; CHECK-NEXT:    vslideup.vi v13, v11, 1
 ; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vslideup.vi v8, v11, 1
+; CHECK-NEXT:    vslideup.vi v8, v14, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vslideup.vi v12, v9, 2
+; CHECK-NEXT:    vslideup.vi v8, v13, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    ret
   %y = fpext <8 x bfloat> %x to <8 x float>
   ret <8 x float> %y
@@ -58,33 +56,31 @@ define <8 x float> @fpext_v8f16(<8 x bfloat> %x) {
 ; CHECK-NEXT:    fmv.x.w a6, fa6
 ; CHECK-NEXT:    fmv.x.w a7, fa7
 ; CHECK-NEXT:    slli a7, a7, 16
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a7
 ; CHECK-NEXT:    slli a6, a6, 16
-; CHECK-NEXT:    vmv.s.x v9, a6
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    slli a5, a5, 16
-; CHECK-NEXT:    vmv.s.x v8, a5
 ; CHECK-NEXT:    slli a4, a4, 16
-; CHECK-NEXT:    vmv.s.x v10, a4
-; CHECK-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v9, 2
 ; CHECK-NEXT:    slli a3, a3, 16
-; CHECK-NEXT:    vmv.s.x v8, a3
 ; CHECK-NEXT:    slli a2, a2, 16
-; CHECK-NEXT:    vmv.s.x v9, a2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    slli a1, a1, 16
-; CHECK-NEXT:    vmv.s.x v11, a1
 ; CHECK-NEXT:    slli a0, a0, 16
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, a7
+; CHECK-NEXT:    vmv.s.x v9, a6
+; CHECK-NEXT:    vmv.s.x v10, a5
+; CHECK-NEXT:    vmv.s.x v12, a4
+; CHECK-NEXT:    vmv.s.x v11, a3
+; CHECK-NEXT:    vmv.s.x v13, a2
+; CHECK-NEXT:    vslideup.vi v9, v8, 1
+; CHECK-NEXT:    vmv.s.x v14, a1
+; CHECK-NEXT:    vslideup.vi v12, v10, 1
+; CHECK-NEXT:    vslideup.vi v13, v11, 1
 ; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vslideup.vi v8, v11, 1
+; CHECK-NEXT:    vslideup.vi v8, v14, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vslideup.vi v12, v9, 2
+; CHECK-NEXT:    vslideup.vi v8, v13, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    ret
   %y = fpext <8 x bfloat> %x to <8 x float>
   ret <8 x float> %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 62ef8cbdb19163..03d5762b4903ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -411,9 +411,9 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v10, v0.t
@@ -438,9 +438,9 @@ define <8 x i1> @fcmp_one_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v12, v0.t
@@ -492,15 +492,15 @@ define <8 x i1> @fcmp_ord_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -523,15 +523,15 @@ define <8 x i1> @fcmp_ord_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -575,9 +575,9 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v10, v0.t
@@ -602,9 +602,9 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v12, v0.t
@@ -1008,15 +1008,15 @@ define <8 x i1> @fcmp_uno_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1039,15 +1039,15 @@ define <8 x i1> @fcmp_uno_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1114,1757 +1114,2269 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ;
 ; ZVFHMIN32-LABEL: fcmp_oeq_vv_v128f16:
 ; ZVFHMIN32:       # %bb.0:
-; ZVFHMIN32-NEXT:    addi sp, sp, -768
-; ZVFHMIN32-NEXT:    .cfi_def_cfa_offset 768
-; ZVFHMIN32-NEXT:    sw ra, 764(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s0, 760(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s2, 756(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s3, 752(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    addi sp, sp, -896
+; ZVFHMIN32-NEXT:    .cfi_def_cfa_offset 896
+; ZVFHMIN32-NEXT:    sw ra, 892(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s0, 888(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s2, 884(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s3, 880(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s4, 876(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s5, 872(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s6, 868(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s7, 864(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s8, 860(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s9, 856(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s10, 852(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s11, 848(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    .cfi_offset ra, -4
 ; ZVFHMIN32-NEXT:    .cfi_offset s0, -8
 ; ZVFHMIN32-NEXT:    .cfi_offset s2, -12
 ; ZVFHMIN32-NEXT:    .cfi_offset s3, -16
-; ZVFHMIN32-NEXT:    addi s0, sp, 768
+; ZVFHMIN32-NEXT:    .cfi_offset s4, -20
+; ZVFHMIN32-NEXT:    .cfi_offset s5, -24
+; ZVFHMIN32-NEXT:    .cfi_offset s6, -28
+; ZVFHMIN32-NEXT:    .cfi_offset s7, -32
+; ZVFHMIN32-NEXT:    .cfi_offset s8, -36
+; ZVFHMIN32-NEXT:    .cfi_offset s9, -40
+; ZVFHMIN32-NEXT:    .cfi_offset s10, -44
+; ZVFHMIN32-NEXT:    .cfi_offset s11, -48
+; ZVFHMIN32-NEXT:    addi s0, sp, 896
 ; ZVFHMIN32-NEXT:    .cfi_def_cfa s0, 0
+; ZVFHMIN32-NEXT:    csrr a1, vlenb
+; ZVFHMIN32-NEXT:    li a2, 30
+; ZVFHMIN32-NEXT:    mul a1, a1, a2
+; ZVFHMIN32-NEXT:    sub sp, sp, a1
 ; ZVFHMIN32-NEXT:    andi sp, sp, -128
 ; ZVFHMIN32-NEXT:    addi a1, a0, 128
 ; ZVFHMIN32-NEXT:    li a2, 64
+; ZVFHMIN32-NEXT:    addi a3, sp, 640
+; ZVFHMIN32-NEXT:    addi a4, sp, 384
+; ZVFHMIN32-NEXT:    addi a5, sp, 512
 ; ZVFHMIN32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; ZVFHMIN32-NEXT:    vle16.v v24, (a1)
 ; ZVFHMIN32-NEXT:    vle16.v v0, (a0)
-; ZVFHMIN32-NEXT:    addi a0, sp, 512
-; ZVFHMIN32-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN32-NEXT:    addi a0, sp, 256
-; ZVFHMIN32-NEXT:    vse16.v v0, (a0)
-; ZVFHMIN32-NEXT:    addi a0, sp, 384
-; ZVFHMIN32-NEXT:    vse16.v v16, (a0)
-; ZVFHMIN32-NEXT:    addi a0, sp, 128
+; ZVFHMIN32-NEXT:    vle16.v v24, (a1)
+; ZVFHMIN32-NEXT:    vse16.v v8, (a3)
+; ZVFHMIN32-NEXT:    vse16.v v0, (a4)
+; ZVFHMIN32-NEXT:    vse16.v v16, (a5)
 ; ZVFHMIN32-NEXT:    vse16.v v24, (a0)
+; ZVFHMIN32-NEXT:    lh a0, 704(sp)
+; ZVFHMIN32-NEXT:    lh a1, 448(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 160(sp)
+; ZVFHMIN32-NEXT:    lh a0, 702(sp)
+; ZVFHMIN32-NEXT:    lh a1, 446(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 159(sp)
+; ZVFHMIN32-NEXT:    lh a0, 700(sp)
+; ZVFHMIN32-NEXT:    lh a1, 444(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 158(sp)
+; ZVFHMIN32-NEXT:    lh a0, 698(sp)
+; ZVFHMIN32-NEXT:    lh a1, 442(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 157(sp)
+; ZVFHMIN32-NEXT:    lh a0, 696(sp)
+; ZVFHMIN32-NEXT:    lh a1, 440(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 156(sp)
+; ZVFHMIN32-NEXT:    lh a0, 694(sp)
+; ZVFHMIN32-NEXT:    lh a1, 438(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 155(sp)
+; ZVFHMIN32-NEXT:    lh a0, 692(sp)
+; ZVFHMIN32-NEXT:    lh a1, 436(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 154(sp)
+; ZVFHMIN32-NEXT:    lh a0, 690(sp)
+; ZVFHMIN32-NEXT:    lh a1, 434(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 153(sp)
+; ZVFHMIN32-NEXT:    lh a0, 688(sp)
+; ZVFHMIN32-NEXT:    lh a1, 432(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 152(sp)
+; ZVFHMIN32-NEXT:    lh a0, 686(sp)
+; ZVFHMIN32-NEXT:    lh a1, 430(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 151(sp)
+; ZVFHMIN32-NEXT:    lh a0, 684(sp)
+; ZVFHMIN32-NEXT:    lh a1, 428(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 150(sp)
+; ZVFHMIN32-NEXT:    lh a0, 682(sp)
+; ZVFHMIN32-NEXT:    lh a1, 426(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 149(sp)
+; ZVFHMIN32-NEXT:    lh a0, 680(sp)
+; ZVFHMIN32-NEXT:    lh a1, 424(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 148(sp)
+; ZVFHMIN32-NEXT:    lh a0, 678(sp)
+; ZVFHMIN32-NEXT:    lh a1, 422(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 147(sp)
+; ZVFHMIN32-NEXT:    lh a0, 676(sp)
+; ZVFHMIN32-NEXT:    lh a1, 420(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 146(sp)
+; ZVFHMIN32-NEXT:    lh a0, 674(sp)
+; ZVFHMIN32-NEXT:    lh a1, 418(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT:    sb a0, 145(sp)
+; ZVFHMIN32-NEXT:    lh a0, 672(sp)
+; ZVFHMIN32-NEXT:    lh a1, 416(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a2, 128(sp)
+; ZVFHMIN32-NEXT:    sb a0, 144(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 576(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 320(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 32(sp)
+; ZVFHMIN32-NEXT:    sb a0, 224(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 574(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 318(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 31(sp)
+; ZVFHMIN32-NEXT:    sb a0, 223(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 572(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 316(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 30(sp)
+; ZVFHMIN32-NEXT:    sb a0, 222(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 570(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 314(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 29(sp)
+; ZVFHMIN32-NEXT:    sb a0, 221(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 568(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 312(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 28(sp)
+; ZVFHMIN32-NEXT:    sb a0, 220(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 566(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 310(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 27(sp)
+; ZVFHMIN32-NEXT:    sb a0, 219(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 564(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 308(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 26(sp)
+; ZVFHMIN32-NEXT:    sb a0, 218(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 562(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 306(sp)
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 29
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 6
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 28
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 5
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 27
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 4
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 26
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 3
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 25
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 24
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 1
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 23
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v26, v8, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 1
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v4, v8, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 8
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v16
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 25(sp)
+; ZVFHMIN32-NEXT:    sb a0, 217(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 304(sp)
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v5, v16, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 4
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 21
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 20
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 1
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 22
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 14
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 24(sp)
+; ZVFHMIN32-NEXT:    sb a0, 216(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 302(sp)
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v13, v0, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 4
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 3
+; ZVFHMIN32-NEXT:    vslidedown.vi v21, v0, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 1
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 15
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 14
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 13
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 6
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 12
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 12
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 11
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 10
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 10
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 4
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v0, v0, 8
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
+; ZVFHMIN32-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s t4, v26
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 23(sp)
+; ZVFHMIN32-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 300(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v20
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v28
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 22(sp)
+; ZVFHMIN32-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 298(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 1
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v0
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 21(sp)
+; ZVFHMIN32-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 296(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v2
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v30
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 20(sp)
+; ZVFHMIN32-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 294(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v22
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 19(sp)
-; ZVFHMIN32-NEXT:    lh a0, 548(sp)
-; ZVFHMIN32-NEXT:    lh a1, 292(sp)
+; ZVFHMIN32-NEXT:    sb a0, 211(sp)
+; ZVFHMIN32-NEXT:    lh a1, 548(sp)
+; ZVFHMIN32-NEXT:    lh t5, 292(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v14
+; ZVFHMIN32-NEXT:    sw a0, 116(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN32-NEXT:    sw a0, 124(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 210(sp)
+; ZVFHMIN32-NEXT:    lh a1, 546(sp)
+; ZVFHMIN32-NEXT:    lh t5, 290(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v24
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN32-NEXT:    sb a1, 209(sp)
+; ZVFHMIN32-NEXT:    lh a1, 544(sp)
+; ZVFHMIN32-NEXT:    lh t5, 288(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a4, 192(sp)
+; ZVFHMIN32-NEXT:    sb a1, 208(sp)
+; ZVFHMIN32-NEXT:    lh t5, 738(sp)
+; ZVFHMIN32-NEXT:    lh t6, 482(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v12
+; ZVFHMIN32-NEXT:    sw a0, 108(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v10
+; ZVFHMIN32-NEXT:    sw a0, 120(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t5, 177(sp)
+; ZVFHMIN32-NEXT:    lh t5, 736(sp)
+; ZVFHMIN32-NEXT:    lh t6, 480(sp)
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 29
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s5, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 28
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s6, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t5, 176(sp)
+; ZVFHMIN32-NEXT:    lh t5, 734(sp)
+; ZVFHMIN32-NEXT:    lh t6, 478(sp)
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 27
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s7, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 26
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s8, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t5, 175(sp)
+; ZVFHMIN32-NEXT:    lh t5, 732(sp)
+; ZVFHMIN32-NEXT:    lh t6, 476(sp)
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 25
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s4, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 24
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s3, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t5, 174(sp)
+; ZVFHMIN32-NEXT:    lh t6, 730(sp)
+; ZVFHMIN32-NEXT:    lh s9, 474(sp)
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 23
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s2, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN32-NEXT:    feq.h t6, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t6, 173(sp)
+; ZVFHMIN32-NEXT:    lh s9, 728(sp)
+; ZVFHMIN32-NEXT:    lh s10, 472(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v31
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v13
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
+; ZVFHMIN32-NEXT:    sb s9, 172(sp)
+; ZVFHMIN32-NEXT:    lh s9, 726(sp)
+; ZVFHMIN32-NEXT:    lh s10, 470(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v29
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v11
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
+; ZVFHMIN32-NEXT:    sb s9, 171(sp)
+; ZVFHMIN32-NEXT:    lh s10, 724(sp)
+; ZVFHMIN32-NEXT:    lh s11, 468(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v7
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v9
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s10
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s11
+; ZVFHMIN32-NEXT:    feq.h s10, fa5, fa4
+; ZVFHMIN32-NEXT:    sb s10, 170(sp)
+; ZVFHMIN32-NEXT:    lh a0, 722(sp)
+; ZVFHMIN32-NEXT:    lh a1, 466(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v21
+; ZVFHMIN32-NEXT:    vmv.x.s s11, v27
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 18(sp)
-; ZVFHMIN32-NEXT:    lh a0, 546(sp)
-; ZVFHMIN32-NEXT:    lh a1, 290(sp)
+; ZVFHMIN32-NEXT:    sb a0, 169(sp)
+; ZVFHMIN32-NEXT:    lh a0, 720(sp)
+; ZVFHMIN32-NEXT:    lh a1, 464(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 168(sp)
+; ZVFHMIN32-NEXT:    lh a0, 718(sp)
+; ZVFHMIN32-NEXT:    lh a1, 462(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, s7
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s8
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa1, fa0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, ra
+; ZVFHMIN32-NEXT:    sb a0, 167(sp)
+; ZVFHMIN32-NEXT:    lh a0, 716(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN32-NEXT:    lh a1, 460(sp)
+; ZVFHMIN32-NEXT:    feq.h s5, fa5, fa1
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN32-NEXT:    sb a1, 166(sp)
+; ZVFHMIN32-NEXT:    lh a1, 714(sp)
+; ZVFHMIN32-NEXT:    lh a2, 458(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a3, fa3, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN32-NEXT:    sb a1, 165(sp)
+; ZVFHMIN32-NEXT:    lh a1, 712(sp)
+; ZVFHMIN32-NEXT:    lh a2, 456(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT:    feq.h a4, fa2, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa3, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, s2
+; ZVFHMIN32-NEXT:    sb a1, 164(sp)
+; ZVFHMIN32-NEXT:    lh a1, 710(sp)
+; ZVFHMIN32-NEXT:    lh a2, 454(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s9
+; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s10
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s11
+; ZVFHMIN32-NEXT:    sb a1, 163(sp)
+; ZVFHMIN32-NEXT:    lh a1, 708(sp)
+; ZVFHMIN32-NEXT:    lh a2, 452(sp)
+; ZVFHMIN32-NEXT:    feq.h s3, fa4, fa5
+; ZVFHMIN32-NEXT:    feq.h s4, fa3, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 162(sp)
+; ZVFHMIN32-NEXT:    lh a1, 706(sp)
+; ZVFHMIN32-NEXT:    lh a2, 450(sp)
+; ZVFHMIN32-NEXT:    sb s4, 129(sp)
+; ZVFHMIN32-NEXT:    sb s3, 130(sp)
+; ZVFHMIN32-NEXT:    sb s2, 131(sp)
+; ZVFHMIN32-NEXT:    sb a4, 132(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a3, 133(sp)
+; ZVFHMIN32-NEXT:    sb a0, 134(sp)
+; ZVFHMIN32-NEXT:    sb s5, 135(sp)
+; ZVFHMIN32-NEXT:    sb a1, 161(sp)
+; ZVFHMIN32-NEXT:    lh a0, 610(sp)
+; ZVFHMIN32-NEXT:    lh a1, 354(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v5
+; ZVFHMIN32-NEXT:    vmv.x.s s5, v23
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 17(sp)
-; ZVFHMIN32-NEXT:    lh a0, 544(sp)
-; ZVFHMIN32-NEXT:    lh a1, 288(sp)
+; ZVFHMIN32-NEXT:    sb a0, 241(sp)
+; ZVFHMIN32-NEXT:    lh a0, 608(sp)
+; ZVFHMIN32-NEXT:    lh a1, 352(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 21
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 20
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v0
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    sb a0, 240(sp)
+; ZVFHMIN32-NEXT:    lh a0, 606(sp)
+; ZVFHMIN32-NEXT:    lh a1, 350(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 22
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT:    sb a0, 239(sp)
+; ZVFHMIN32-NEXT:    lh a0, 604(sp)
+; ZVFHMIN32-NEXT:    lh a1, 348(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 238(sp)
+; ZVFHMIN32-NEXT:    lh a0, 602(sp)
+; ZVFHMIN32-NEXT:    lh a1, 346(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 237(sp)
+; ZVFHMIN32-NEXT:    lh a0, 600(sp)
+; ZVFHMIN32-NEXT:    lh a1, 344(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 236(sp)
+; ZVFHMIN32-NEXT:    lh a0, 598(sp)
+; ZVFHMIN32-NEXT:    lh a1, 342(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 235(sp)
+; ZVFHMIN32-NEXT:    lh a0, 596(sp)
+; ZVFHMIN32-NEXT:    lh a1, 340(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 234(sp)
+; ZVFHMIN32-NEXT:    lh a0, 594(sp)
+; ZVFHMIN32-NEXT:    lh a1, 338(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 233(sp)
+; ZVFHMIN32-NEXT:    lh a0, 592(sp)
 ; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    lh t5, 336(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, t5
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN32-NEXT:    sb a0, 232(sp)
+; ZVFHMIN32-NEXT:    lh a0, 590(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a3
+; ZVFHMIN32-NEXT:    lh a2, 334(sp)
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    feq.h t6, fa4, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s6
+; ZVFHMIN32-NEXT:    sb a0, 231(sp)
+; ZVFHMIN32-NEXT:    lh a0, 588(sp)
+; ZVFHMIN32-NEXT:    lh a2, 332(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT:    sb a0, 230(sp)
+; ZVFHMIN32-NEXT:    lh a0, 586(sp)
+; ZVFHMIN32-NEXT:    lh a2, 330(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s8
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN32-NEXT:    sb a0, 229(sp)
+; ZVFHMIN32-NEXT:    lh a0, 584(sp)
+; ZVFHMIN32-NEXT:    lh a2, 328(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN32-NEXT:    feq.h s4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
+; ZVFHMIN32-NEXT:    sb a0, 228(sp)
+; ZVFHMIN32-NEXT:    lh a0, 582(sp)
+; ZVFHMIN32-NEXT:    lh a2, 326(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN32-NEXT:    sb a0, 227(sp)
+; ZVFHMIN32-NEXT:    lh a0, 580(sp)
+; ZVFHMIN32-NEXT:    lh a2, 324(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s7
+; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 226(sp)
+; ZVFHMIN32-NEXT:    lh a0, 578(sp)
+; ZVFHMIN32-NEXT:    lh a2, 322(sp)
+; ZVFHMIN32-NEXT:    sb s2, 193(sp)
+; ZVFHMIN32-NEXT:    sb a1, 194(sp)
+; ZVFHMIN32-NEXT:    sb s4, 195(sp)
+; ZVFHMIN32-NEXT:    sb a4, 196(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a3, 197(sp)
+; ZVFHMIN32-NEXT:    sb t6, 198(sp)
+; ZVFHMIN32-NEXT:    sb t5, 199(sp)
+; ZVFHMIN32-NEXT:    sb a0, 225(sp)
+; ZVFHMIN32-NEXT:    lh a0, 766(sp)
+; ZVFHMIN32-NEXT:    lh a1, 510(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 14
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a1, 0(sp)
-; ZVFHMIN32-NEXT:    sb a0, 16(sp)
-; ZVFHMIN32-NEXT:    lh a0, 448(sp)
-; ZVFHMIN32-NEXT:    lh a1, 192(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 191(sp)
+; ZVFHMIN32-NEXT:    lh a0, 764(sp)
+; ZVFHMIN32-NEXT:    lh a1, 508(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v6
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 96(sp)
-; ZVFHMIN32-NEXT:    lh a0, 446(sp)
-; ZVFHMIN32-NEXT:    lh a1, 190(sp)
+; ZVFHMIN32-NEXT:    sb a0, 190(sp)
+; ZVFHMIN32-NEXT:    lh a0, 762(sp)
+; ZVFHMIN32-NEXT:    lh a1, 506(sp)
+; ZVFHMIN32-NEXT:    csrr a3, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a3, 3
+; ZVFHMIN32-NEXT:    add a3, sp, a3
+; ZVFHMIN32-NEXT:    addi a3, a3, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    csrr a4, vlenb
+; ZVFHMIN32-NEXT:    li s3, 6
+; ZVFHMIN32-NEXT:    mul a4, a4, s3
+; ZVFHMIN32-NEXT:    add a4, sp, a4
+; ZVFHMIN32-NEXT:    addi a4, a4, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 95(sp)
-; ZVFHMIN32-NEXT:    lh a0, 444(sp)
-; ZVFHMIN32-NEXT:    lh a1, 188(sp)
+; ZVFHMIN32-NEXT:    sb a0, 189(sp)
+; ZVFHMIN32-NEXT:    lh a0, 760(sp)
+; ZVFHMIN32-NEXT:    lh a1, 504(sp)
+; ZVFHMIN32-NEXT:    csrr s3, vlenb
+; ZVFHMIN32-NEXT:    li s4, 12
+; ZVFHMIN32-NEXT:    mul s3, s3, s4
+; ZVFHMIN32-NEXT:    add s3, sp, s3
+; ZVFHMIN32-NEXT:    addi s3, s3, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v8
+; ZVFHMIN32-NEXT:    csrr s3, vlenb
+; ZVFHMIN32-NEXT:    li s4, 10
+; ZVFHMIN32-NEXT:    mul s3, s3, s4
+; ZVFHMIN32-NEXT:    add s3, sp, s3
+; ZVFHMIN32-NEXT:    addi s3, s3, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s4, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 94(sp)
-; ZVFHMIN32-NEXT:    lh a0, 442(sp)
-; ZVFHMIN32-NEXT:    lh a1, 186(sp)
+; ZVFHMIN32-NEXT:    sb a0, 188(sp)
+; ZVFHMIN32-NEXT:    lh a0, 758(sp)
+; ZVFHMIN32-NEXT:    lh a1, 502(sp)
+; ZVFHMIN32-NEXT:    csrr s3, vlenb
+; ZVFHMIN32-NEXT:    slli s3, s3, 4
+; ZVFHMIN32-NEXT:    add s3, sp, s3
+; ZVFHMIN32-NEXT:    addi s3, s3, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s5, v8
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v16
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 93(sp)
-; ZVFHMIN32-NEXT:    lh a0, 440(sp)
-; ZVFHMIN32-NEXT:    lh a1, 184(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT:    sb a0, 187(sp)
+; ZVFHMIN32-NEXT:    lh a0, 756(sp)
+; ZVFHMIN32-NEXT:    lh a1, 500(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h t4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 92(sp)
-; ZVFHMIN32-NEXT:    lh a0, 438(sp)
-; ZVFHMIN32-NEXT:    lh a1, 182(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN32-NEXT:    sb a0, 186(sp)
+; ZVFHMIN32-NEXT:    lh a0, 754(sp)
+; ZVFHMIN32-NEXT:    lh a1, 498(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h t3, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 91(sp)
-; ZVFHMIN32-NEXT:    lh a0, 436(sp)
-; ZVFHMIN32-NEXT:    lh a1, 180(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
+; ZVFHMIN32-NEXT:    sb a0, 185(sp)
+; ZVFHMIN32-NEXT:    lh a0, 752(sp)
+; ZVFHMIN32-NEXT:    lh a1, 496(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 90(sp)
-; ZVFHMIN32-NEXT:    lh a0, 434(sp)
-; ZVFHMIN32-NEXT:    lh a1, 178(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT:    sb a0, 184(sp)
+; ZVFHMIN32-NEXT:    lh a0, 750(sp)
+; ZVFHMIN32-NEXT:    lh a1, 494(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 89(sp)
-; ZVFHMIN32-NEXT:    lh a0, 432(sp)
-; ZVFHMIN32-NEXT:    lh a1, 176(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT:    sb a0, 183(sp)
+; ZVFHMIN32-NEXT:    lh a0, 748(sp)
+; ZVFHMIN32-NEXT:    lh a1, 492(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s4
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 88(sp)
-; ZVFHMIN32-NEXT:    lh a0, 430(sp)
-; ZVFHMIN32-NEXT:    lh a1, 174(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
+; ZVFHMIN32-NEXT:    sb a0, 182(sp)
+; ZVFHMIN32-NEXT:    lh a0, 746(sp)
+; ZVFHMIN32-NEXT:    lh a1, 490(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s5
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 87(sp)
-; ZVFHMIN32-NEXT:    lh a0, 428(sp)
-; ZVFHMIN32-NEXT:    lh a1, 172(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
+; ZVFHMIN32-NEXT:    sb a0, 181(sp)
+; ZVFHMIN32-NEXT:    lh a0, 744(sp)
+; ZVFHMIN32-NEXT:    lh a1, 488(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 86(sp)
-; ZVFHMIN32-NEXT:    lh a0, 426(sp)
-; ZVFHMIN32-NEXT:    lh a1, 170(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
+; ZVFHMIN32-NEXT:    addi a1, sp, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN32-NEXT:    sb a0, 180(sp)
+; ZVFHMIN32-NEXT:    lh a0, 742(sp)
+; ZVFHMIN32-NEXT:    lh a7, 486(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 179(sp)
+; ZVFHMIN32-NEXT:    lh a0, 740(sp)
+; ZVFHMIN32-NEXT:    lh a7, 484(sp)
+; ZVFHMIN32-NEXT:    sb a2, 140(sp)
+; ZVFHMIN32-NEXT:    sb t1, 141(sp)
+; ZVFHMIN32-NEXT:    sb t3, 142(sp)
+; ZVFHMIN32-NEXT:    sb t4, 143(sp)
+; ZVFHMIN32-NEXT:    sb a1, 136(sp)
+; ZVFHMIN32-NEXT:    sb a6, 137(sp)
+; ZVFHMIN32-NEXT:    sb a4, 138(sp)
+; ZVFHMIN32-NEXT:    sb a3, 139(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 178(sp)
+; ZVFHMIN32-NEXT:    lh a0, 638(sp)
+; ZVFHMIN32-NEXT:    lh a1, 382(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 14
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 85(sp)
-; ZVFHMIN32-NEXT:    lh a0, 424(sp)
-; ZVFHMIN32-NEXT:    lh a1, 168(sp)
+; ZVFHMIN32-NEXT:    sb a0, 255(sp)
+; ZVFHMIN32-NEXT:    lh a0, 636(sp)
+; ZVFHMIN32-NEXT:    lh a1, 380(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 13
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 84(sp)
-; ZVFHMIN32-NEXT:    lh a0, 422(sp)
-; ZVFHMIN32-NEXT:    lh a1, 166(sp)
+; ZVFHMIN32-NEXT:    sb a0, 254(sp)
+; ZVFHMIN32-NEXT:    lh a0, 634(sp)
+; ZVFHMIN32-NEXT:    lh a1, 378(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 12
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 83(sp)
-; ZVFHMIN32-NEXT:    lh a0, 420(sp)
-; ZVFHMIN32-NEXT:    lh a1, 164(sp)
+; ZVFHMIN32-NEXT:    sb a0, 253(sp)
+; ZVFHMIN32-NEXT:    lh a0, 632(sp)
+; ZVFHMIN32-NEXT:    lh a1, 376(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 11
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 82(sp)
-; ZVFHMIN32-NEXT:    lh a0, 418(sp)
-; ZVFHMIN32-NEXT:    lh a1, 162(sp)
+; ZVFHMIN32-NEXT:    sb a0, 252(sp)
+; ZVFHMIN32-NEXT:    lh a0, 630(sp)
+; ZVFHMIN32-NEXT:    lh a1, 374(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 10
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 81(sp)
-; ZVFHMIN32-NEXT:    lh a0, 416(sp)
-; ZVFHMIN32-NEXT:    lh a1, 160(sp)
+; ZVFHMIN32-NEXT:    sb a0, 251(sp)
+; ZVFHMIN32-NEXT:    lh a0, 628(sp)
+; ZVFHMIN32-NEXT:    lh a1, 372(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v24
+; ZVFHMIN32-NEXT:    lw a1, 112(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v16
+; ZVFHMIN32-NEXT:    sb a0, 250(sp)
+; ZVFHMIN32-NEXT:    lh a0, 626(sp)
+; ZVFHMIN32-NEXT:    lh a1, 370(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a1, 64(sp)
-; ZVFHMIN32-NEXT:    sb a0, 80(sp)
-; ZVFHMIN32-NEXT:    lh a0, 610(sp)
-; ZVFHMIN32-NEXT:    lh a1, 354(sp)
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    sb a0, 249(sp)
+; ZVFHMIN32-NEXT:    lh a0, 624(sp)
+; ZVFHMIN32-NEXT:    lh a1, 368(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 49(sp)
-; ZVFHMIN32-NEXT:    lh a0, 608(sp)
-; ZVFHMIN32-NEXT:    lh a1, 352(sp)
+; ZVFHMIN32-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    sb a0, 248(sp)
+; ZVFHMIN32-NEXT:    lh a0, 622(sp)
+; ZVFHMIN32-NEXT:    lh a1, 366(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 48(sp)
-; ZVFHMIN32-NEXT:    lh a0, 606(sp)
-; ZVFHMIN32-NEXT:    lh a1, 350(sp)
+; ZVFHMIN32-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    sb a0, 247(sp)
+; ZVFHMIN32-NEXT:    lh a0, 620(sp)
+; ZVFHMIN32-NEXT:    lh a1, 364(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 47(sp)
-; ZVFHMIN32-NEXT:    lh a1, 604(sp)
-; ZVFHMIN32-NEXT:    lh a2, 348(sp)
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 7
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v10
+; ZVFHMIN32-NEXT:    lw a1, 120(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 46(sp)
-; ZVFHMIN32-NEXT:    lh a2, 602(sp)
-; ZVFHMIN32-NEXT:    lh a3, 346(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 45(sp)
-; ZVFHMIN32-NEXT:    lh a3, 600(sp)
-; ZVFHMIN32-NEXT:    lh a4, 344(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 6
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a3, 44(sp)
-; ZVFHMIN32-NEXT:    lh a4, 598(sp)
-; ZVFHMIN32-NEXT:    lh a5, 342(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 6
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 43(sp)
-; ZVFHMIN32-NEXT:    lh a5, 596(sp)
-; ZVFHMIN32-NEXT:    lh a6, 340(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 5
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a5, 42(sp)
-; ZVFHMIN32-NEXT:    lh a6, 594(sp)
-; ZVFHMIN32-NEXT:    lh a7, 338(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 5
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a6, 41(sp)
-; ZVFHMIN32-NEXT:    lh a7, 592(sp)
-; ZVFHMIN32-NEXT:    lh t0, 336(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 4
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a7, 40(sp)
-; ZVFHMIN32-NEXT:    lh t0, 590(sp)
-; ZVFHMIN32-NEXT:    lh t1, 334(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 4
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t0, 39(sp)
-; ZVFHMIN32-NEXT:    lh t1, 588(sp)
-; ZVFHMIN32-NEXT:    lh t2, 332(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 3
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t1, 38(sp)
-; ZVFHMIN32-NEXT:    lh t2, 586(sp)
-; ZVFHMIN32-NEXT:    lh t3, 330(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 3
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 37(sp)
-; ZVFHMIN32-NEXT:    lh t2, 584(sp)
-; ZVFHMIN32-NEXT:    lh t3, 328(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 2
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 36(sp)
-; ZVFHMIN32-NEXT:    lh t2, 582(sp)
-; ZVFHMIN32-NEXT:    lh t3, 326(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 2
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 35(sp)
-; ZVFHMIN32-NEXT:    lh t2, 580(sp)
-; ZVFHMIN32-NEXT:    lh t3, 324(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 1
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 34(sp)
-; ZVFHMIN32-NEXT:    lh t2, 578(sp)
-; ZVFHMIN32-NEXT:    lh t3, 322(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 1
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a2, 5(sp)
-; ZVFHMIN32-NEXT:    sb a1, 6(sp)
-; ZVFHMIN32-NEXT:    sb a0, 7(sp)
-; ZVFHMIN32-NEXT:    sb t2, 33(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 1(sp)
-; ZVFHMIN32-NEXT:    sb a2, 2(sp)
-; ZVFHMIN32-NEXT:    sb a1, 3(sp)
-; ZVFHMIN32-NEXT:    sb a0, 4(sp)
-; ZVFHMIN32-NEXT:    lh a0, 482(sp)
-; ZVFHMIN32-NEXT:    lh a1, 226(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 113(sp)
-; ZVFHMIN32-NEXT:    lh a0, 480(sp)
-; ZVFHMIN32-NEXT:    lh a1, 224(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 112(sp)
-; ZVFHMIN32-NEXT:    lh a0, 478(sp)
-; ZVFHMIN32-NEXT:    lh a1, 222(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 111(sp)
-; ZVFHMIN32-NEXT:    lh a1, 476(sp)
-; ZVFHMIN32-NEXT:    lh a2, 220(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 7
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 110(sp)
-; ZVFHMIN32-NEXT:    lh a2, 474(sp)
-; ZVFHMIN32-NEXT:    lh a3, 218(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 7
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 109(sp)
-; ZVFHMIN32-NEXT:    lh a3, 472(sp)
-; ZVFHMIN32-NEXT:    lh a4, 216(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 6
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a3, 108(sp)
-; ZVFHMIN32-NEXT:    lh a4, 470(sp)
-; ZVFHMIN32-NEXT:    lh a5, 214(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 6
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 107(sp)
-; ZVFHMIN32-NEXT:    lh a5, 468(sp)
-; ZVFHMIN32-NEXT:    lh a6, 212(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 5
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a5, 106(sp)
-; ZVFHMIN32-NEXT:    lh a6, 466(sp)
-; ZVFHMIN32-NEXT:    lh a7, 210(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 5
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a6, 105(sp)
-; ZVFHMIN32-NEXT:    lh a7, 464(sp)
-; ZVFHMIN32-NEXT:    lh t0, 208(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 4
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a7, 104(sp)
-; ZVFHMIN32-NEXT:    lh t0, 462(sp)
-; ZVFHMIN32-NEXT:    lh t1, 206(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 4
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t0, 103(sp)
-; ZVFHMIN32-NEXT:    lh t1, 460(sp)
-; ZVFHMIN32-NEXT:    lh t2, 204(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 3
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t1, 102(sp)
-; ZVFHMIN32-NEXT:    lh t2, 458(sp)
-; ZVFHMIN32-NEXT:    lh t3, 202(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 3
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 101(sp)
-; ZVFHMIN32-NEXT:    lh t2, 456(sp)
-; ZVFHMIN32-NEXT:    lh t3, 200(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 2
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 100(sp)
-; ZVFHMIN32-NEXT:    lh t2, 454(sp)
-; ZVFHMIN32-NEXT:    lh t3, 198(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 2
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 99(sp)
-; ZVFHMIN32-NEXT:    lh t2, 452(sp)
-; ZVFHMIN32-NEXT:    lh t3, 196(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 1
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 98(sp)
-; ZVFHMIN32-NEXT:    lh t2, 450(sp)
-; ZVFHMIN32-NEXT:    lh t3, 194(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 1
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a2, 69(sp)
-; ZVFHMIN32-NEXT:    sb a1, 70(sp)
-; ZVFHMIN32-NEXT:    sb a0, 71(sp)
-; ZVFHMIN32-NEXT:    sb t2, 97(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 65(sp)
-; ZVFHMIN32-NEXT:    sb a2, 66(sp)
-; ZVFHMIN32-NEXT:    sb a1, 67(sp)
-; ZVFHMIN32-NEXT:    sb a0, 68(sp)
-; ZVFHMIN32-NEXT:    lh a0, 638(sp)
-; ZVFHMIN32-NEXT:    lh a1, 382(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 63(sp)
-; ZVFHMIN32-NEXT:    lh a0, 636(sp)
-; ZVFHMIN32-NEXT:    lh a1, 380(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 62(sp)
-; ZVFHMIN32-NEXT:    lh a0, 634(sp)
-; ZVFHMIN32-NEXT:    lh a1, 378(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 61(sp)
-; ZVFHMIN32-NEXT:    lh a0, 632(sp)
-; ZVFHMIN32-NEXT:    lh a1, 376(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 60(sp)
-; ZVFHMIN32-NEXT:    lh a0, 630(sp)
-; ZVFHMIN32-NEXT:    lh a1, 374(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 59(sp)
-; ZVFHMIN32-NEXT:    lh a0, 628(sp)
-; ZVFHMIN32-NEXT:    lh a1, 372(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 58(sp)
-; ZVFHMIN32-NEXT:    lh a0, 626(sp)
-; ZVFHMIN32-NEXT:    lh a1, 370(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 57(sp)
-; ZVFHMIN32-NEXT:    lh a0, 624(sp)
-; ZVFHMIN32-NEXT:    lh a1, 368(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 56(sp)
-; ZVFHMIN32-NEXT:    lh a0, 622(sp)
-; ZVFHMIN32-NEXT:    lh a1, 366(sp)
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v0, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v22, v0, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v26, v0, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v28, v0, 12
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v0, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 10
-; ZVFHMIN32-NEXT:    vslidedown.vi v12, v0, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v14, v0, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v20
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 55(sp)
-; ZVFHMIN32-NEXT:    lh a0, 620(sp)
-; ZVFHMIN32-NEXT:    lh a1, 364(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 15
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v20
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v22
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 54(sp)
+; ZVFHMIN32-NEXT:    sb a0, 246(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 618(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 362(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v20
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v26
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
+; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 53(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN32-NEXT:    sb a0, 245(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 616(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 360(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 13
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v20
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v28
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 52(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT:    sb a0, 244(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 614(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 358(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 12
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v20
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
+; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 51(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 8
+; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    sb a0, 243(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 612(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 356(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v18
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v8, 11
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v18
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 50(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 12(sp)
-; ZVFHMIN32-NEXT:    sb a2, 13(sp)
-; ZVFHMIN32-NEXT:    sb a1, 14(sp)
-; ZVFHMIN32-NEXT:    sb a0, 15(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 10
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v12
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 9
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v14
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v8, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 8(sp)
-; ZVFHMIN32-NEXT:    sb a2, 9(sp)
-; ZVFHMIN32-NEXT:    sb a1, 10(sp)
-; ZVFHMIN32-NEXT:    sb a0, 11(sp)
-; ZVFHMIN32-NEXT:    lh a0, 510(sp)
-; ZVFHMIN32-NEXT:    lh a1, 254(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 127(sp)
-; ZVFHMIN32-NEXT:    lh a0, 508(sp)
-; ZVFHMIN32-NEXT:    lh a1, 252(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 126(sp)
-; ZVFHMIN32-NEXT:    lh a0, 506(sp)
-; ZVFHMIN32-NEXT:    lh a1, 250(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 125(sp)
-; ZVFHMIN32-NEXT:    lh a0, 504(sp)
-; ZVFHMIN32-NEXT:    lh a1, 248(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 124(sp)
-; ZVFHMIN32-NEXT:    lh a0, 502(sp)
-; ZVFHMIN32-NEXT:    lh a1, 246(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 123(sp)
-; ZVFHMIN32-NEXT:    lh a0, 500(sp)
-; ZVFHMIN32-NEXT:    lh a1, 244(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 122(sp)
-; ZVFHMIN32-NEXT:    lh a0, 498(sp)
-; ZVFHMIN32-NEXT:    lh a1, 242(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v12, v24, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v14, v24, 12
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v24, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v24, 10
-; ZVFHMIN32-NEXT:    vslidedown.vi v22, v24, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v24, v24, 8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 121(sp)
-; ZVFHMIN32-NEXT:    lh a2, 496(sp)
-; ZVFHMIN32-NEXT:    lh a3, 240(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 15
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    sb a5, 204(sp)
+; ZVFHMIN32-NEXT:    sb a4, 205(sp)
+; ZVFHMIN32-NEXT:    sb a2, 206(sp)
+; ZVFHMIN32-NEXT:    sb a3, 207(sp)
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 120(sp)
-; ZVFHMIN32-NEXT:    lh a4, 494(sp)
-; ZVFHMIN32-NEXT:    lh a5, 238(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 14
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 119(sp)
-; ZVFHMIN32-NEXT:    lh a4, 492(sp)
-; ZVFHMIN32-NEXT:    lh a5, 236(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v12
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 13
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 118(sp)
-; ZVFHMIN32-NEXT:    lh a4, 490(sp)
-; ZVFHMIN32-NEXT:    lh a5, 234(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v14
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 12
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 117(sp)
-; ZVFHMIN32-NEXT:    lh a4, 488(sp)
-; ZVFHMIN32-NEXT:    lh a5, 232(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v18
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 11
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 116(sp)
-; ZVFHMIN32-NEXT:    lh a4, 486(sp)
-; ZVFHMIN32-NEXT:    lh a5, 230(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v20
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 10
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 115(sp)
-; ZVFHMIN32-NEXT:    lh a4, 484(sp)
-; ZVFHMIN32-NEXT:    lh a5, 228(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v22
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 9
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
-; ZVFHMIN32-NEXT:    vmv.x.s s3, v24
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 114(sp)
+; ZVFHMIN32-NEXT:    sb a2, 200(sp)
+; ZVFHMIN32-NEXT:    sb a6, 201(sp)
+; ZVFHMIN32-NEXT:    sb a7, 202(sp)
+; ZVFHMIN32-NEXT:    sb t0, 203(sp)
+; ZVFHMIN32-NEXT:    li a2, 128
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 76(sp)
-; ZVFHMIN32-NEXT:    sb a2, 77(sp)
-; ZVFHMIN32-NEXT:    sb a1, 78(sp)
-; ZVFHMIN32-NEXT:    sb a0, 79(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 72(sp)
-; ZVFHMIN32-NEXT:    sb a2, 73(sp)
-; ZVFHMIN32-NEXT:    sb a1, 74(sp)
-; ZVFHMIN32-NEXT:    sb a0, 75(sp)
-; ZVFHMIN32-NEXT:    li a0, 128
-; ZVFHMIN32-NEXT:    mv a1, sp
-; ZVFHMIN32-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; ZVFHMIN32-NEXT:    vle8.v v8, (a1)
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 242(sp)
+; ZVFHMIN32-NEXT:    addi a0, sp, 128
+; ZVFHMIN32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN32-NEXT:    vle8.v v8, (a0)
 ; ZVFHMIN32-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN32-NEXT:    vmsne.vi v0, v8, 0
-; ZVFHMIN32-NEXT:    addi sp, s0, -768
-; ZVFHMIN32-NEXT:    .cfi_def_cfa sp, 768
-; ZVFHMIN32-NEXT:    lw ra, 764(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s0, 760(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s2, 756(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s3, 752(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    addi sp, s0, -896
+; ZVFHMIN32-NEXT:    .cfi_def_cfa sp, 896
+; ZVFHMIN32-NEXT:    lw ra, 892(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s0, 888(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s2, 884(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s3, 880(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s4, 876(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s5, 872(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s6, 868(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s7, 864(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s8, 860(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s9, 856(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s10, 852(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s11, 848(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    .cfi_restore ra
 ; ZVFHMIN32-NEXT:    .cfi_restore s0
 ; ZVFHMIN32-NEXT:    .cfi_restore s2
 ; ZVFHMIN32-NEXT:    .cfi_restore s3
-; ZVFHMIN32-NEXT:    addi sp, sp, 768
+; ZVFHMIN32-NEXT:    .cfi_restore s4
+; ZVFHMIN32-NEXT:    .cfi_restore s5
+; ZVFHMIN32-NEXT:    .cfi_restore s6
+; ZVFHMIN32-NEXT:    .cfi_restore s7
+; ZVFHMIN32-NEXT:    .cfi_restore s8
+; ZVFHMIN32-NEXT:    .cfi_restore s9
+; ZVFHMIN32-NEXT:    .cfi_restore s10
+; ZVFHMIN32-NEXT:    .cfi_restore s11
+; ZVFHMIN32-NEXT:    addi sp, sp, 896
 ; ZVFHMIN32-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN32-NEXT:    ret
 ;
 ; ZVFHMIN64-LABEL: fcmp_oeq_vv_v128f16:
 ; ZVFHMIN64:       # %bb.0:
-; ZVFHMIN64-NEXT:    addi sp, sp, -768
-; ZVFHMIN64-NEXT:    .cfi_def_cfa_offset 768
-; ZVFHMIN64-NEXT:    sd ra, 760(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s0, 752(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s2, 744(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s3, 736(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    addi sp, sp, -896
+; ZVFHMIN64-NEXT:    .cfi_def_cfa_offset 896
+; ZVFHMIN64-NEXT:    sd ra, 888(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s0, 880(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s2, 872(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s3, 864(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s4, 856(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s5, 848(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s6, 840(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s7, 832(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s8, 824(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s9, 816(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s10, 808(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s11, 800(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    .cfi_offset ra, -8
 ; ZVFHMIN64-NEXT:    .cfi_offset s0, -16
 ; ZVFHMIN64-NEXT:    .cfi_offset s2, -24
 ; ZVFHMIN64-NEXT:    .cfi_offset s3, -32
-; ZVFHMIN64-NEXT:    addi s0, sp, 768
+; ZVFHMIN64-NEXT:    .cfi_offset s4, -40
+; ZVFHMIN64-NEXT:    .cfi_offset s5, -48
+; ZVFHMIN64-NEXT:    .cfi_offset s6, -56
+; ZVFHMIN64-NEXT:    .cfi_offset s7, -64
+; ZVFHMIN64-NEXT:    .cfi_offset s8, -72
+; ZVFHMIN64-NEXT:    .cfi_offset s9, -80
+; ZVFHMIN64-NEXT:    .cfi_offset s10, -88
+; ZVFHMIN64-NEXT:    .cfi_offset s11, -96
+; ZVFHMIN64-NEXT:    addi s0, sp, 896
 ; ZVFHMIN64-NEXT:    .cfi_def_cfa s0, 0
+; ZVFHMIN64-NEXT:    csrr a1, vlenb
+; ZVFHMIN64-NEXT:    li a2, 30
+; ZVFHMIN64-NEXT:    mul a1, a1, a2
+; ZVFHMIN64-NEXT:    sub sp, sp, a1
 ; ZVFHMIN64-NEXT:    andi sp, sp, -128
 ; ZVFHMIN64-NEXT:    addi a1, a0, 128
 ; ZVFHMIN64-NEXT:    li a2, 64
+; ZVFHMIN64-NEXT:    addi a3, sp, 640
+; ZVFHMIN64-NEXT:    addi a4, sp, 384
+; ZVFHMIN64-NEXT:    addi a5, sp, 512
 ; ZVFHMIN64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; ZVFHMIN64-NEXT:    vle16.v v24, (a1)
 ; ZVFHMIN64-NEXT:    vle16.v v0, (a0)
-; ZVFHMIN64-NEXT:    addi a0, sp, 512
-; ZVFHMIN64-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN64-NEXT:    addi a0, sp, 256
-; ZVFHMIN64-NEXT:    vse16.v v0, (a0)
-; ZVFHMIN64-NEXT:    addi a0, sp, 384
-; ZVFHMIN64-NEXT:    vse16.v v16, (a0)
-; ZVFHMIN64-NEXT:    addi a0, sp, 128
+; ZVFHMIN64-NEXT:    vle16.v v24, (a1)
+; ZVFHMIN64-NEXT:    vse16.v v8, (a3)
+; ZVFHMIN64-NEXT:    vse16.v v0, (a4)
+; ZVFHMIN64-NEXT:    vse16.v v16, (a5)
 ; ZVFHMIN64-NEXT:    vse16.v v24, (a0)
+; ZVFHMIN64-NEXT:    lh a0, 704(sp)
+; ZVFHMIN64-NEXT:    lh a1, 448(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 160(sp)
+; ZVFHMIN64-NEXT:    lh a0, 702(sp)
+; ZVFHMIN64-NEXT:    lh a1, 446(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 159(sp)
+; ZVFHMIN64-NEXT:    lh a0, 700(sp)
+; ZVFHMIN64-NEXT:    lh a1, 444(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 158(sp)
+; ZVFHMIN64-NEXT:    lh a0, 698(sp)
+; ZVFHMIN64-NEXT:    lh a1, 442(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 157(sp)
+; ZVFHMIN64-NEXT:    lh a0, 696(sp)
+; ZVFHMIN64-NEXT:    lh a1, 440(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 156(sp)
+; ZVFHMIN64-NEXT:    lh a0, 694(sp)
+; ZVFHMIN64-NEXT:    lh a1, 438(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 155(sp)
+; ZVFHMIN64-NEXT:    lh a0, 692(sp)
+; ZVFHMIN64-NEXT:    lh a1, 436(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 154(sp)
+; ZVFHMIN64-NEXT:    lh a0, 690(sp)
+; ZVFHMIN64-NEXT:    lh a1, 434(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 153(sp)
+; ZVFHMIN64-NEXT:    lh a0, 688(sp)
+; ZVFHMIN64-NEXT:    lh a1, 432(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 152(sp)
+; ZVFHMIN64-NEXT:    lh a0, 686(sp)
+; ZVFHMIN64-NEXT:    lh a1, 430(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 151(sp)
+; ZVFHMIN64-NEXT:    lh a0, 684(sp)
+; ZVFHMIN64-NEXT:    lh a1, 428(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 150(sp)
+; ZVFHMIN64-NEXT:    lh a0, 682(sp)
+; ZVFHMIN64-NEXT:    lh a1, 426(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 149(sp)
+; ZVFHMIN64-NEXT:    lh a0, 680(sp)
+; ZVFHMIN64-NEXT:    lh a1, 424(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 148(sp)
+; ZVFHMIN64-NEXT:    lh a0, 678(sp)
+; ZVFHMIN64-NEXT:    lh a1, 422(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 147(sp)
+; ZVFHMIN64-NEXT:    lh a0, 676(sp)
+; ZVFHMIN64-NEXT:    lh a1, 420(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 146(sp)
+; ZVFHMIN64-NEXT:    lh a0, 674(sp)
+; ZVFHMIN64-NEXT:    lh a1, 418(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT:    sb a0, 145(sp)
+; ZVFHMIN64-NEXT:    lh a0, 672(sp)
+; ZVFHMIN64-NEXT:    lh a1, 416(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a2, 128(sp)
+; ZVFHMIN64-NEXT:    sb a0, 144(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 576(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 320(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 32(sp)
+; ZVFHMIN64-NEXT:    sb a0, 224(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 574(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 318(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 31(sp)
+; ZVFHMIN64-NEXT:    sb a0, 223(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 572(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 316(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 30(sp)
+; ZVFHMIN64-NEXT:    sb a0, 222(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 570(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 314(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 29(sp)
+; ZVFHMIN64-NEXT:    sb a0, 221(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 568(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 312(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 28(sp)
+; ZVFHMIN64-NEXT:    sb a0, 220(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 566(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 310(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 27(sp)
+; ZVFHMIN64-NEXT:    sb a0, 219(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 564(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 308(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 26(sp)
+; ZVFHMIN64-NEXT:    sb a0, 218(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 562(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 306(sp)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 29
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 6
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 28
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 5
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 27
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 4
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 26
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 3
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 25
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 24
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 1
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 23
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v26, v8, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 1
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v4, v8, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 8
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v16
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 25(sp)
+; ZVFHMIN64-NEXT:    sb a0, 217(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 304(sp)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v5, v16, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 21
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 20
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 1
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 22
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 14
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 24(sp)
+; ZVFHMIN64-NEXT:    sb a0, 216(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 302(sp)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v13, v0, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 3
+; ZVFHMIN64-NEXT:    vslidedown.vi v21, v0, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 1
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 15
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 14
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 13
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 6
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 12
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 12
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 11
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 10
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 10
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 4
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v0, v0, 8
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
+; ZVFHMIN64-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s t4, v26
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 23(sp)
+; ZVFHMIN64-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 300(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v20
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v28
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 22(sp)
+; ZVFHMIN64-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 298(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 1
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v0
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 21(sp)
+; ZVFHMIN64-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 296(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v2
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v30
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 20(sp)
+; ZVFHMIN64-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 294(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v22
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 19(sp)
-; ZVFHMIN64-NEXT:    lh a0, 548(sp)
-; ZVFHMIN64-NEXT:    lh a1, 292(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 18(sp)
-; ZVFHMIN64-NEXT:    lh a0, 546(sp)
-; ZVFHMIN64-NEXT:    lh a1, 290(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 17(sp)
-; ZVFHMIN64-NEXT:    lh a0, 544(sp)
-; ZVFHMIN64-NEXT:    lh a1, 288(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v0
+; ZVFHMIN64-NEXT:    sb a0, 211(sp)
+; ZVFHMIN64-NEXT:    lh a1, 548(sp)
+; ZVFHMIN64-NEXT:    lh t5, 292(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v14
+; ZVFHMIN64-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN64-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a1, 0(sp)
-; ZVFHMIN64-NEXT:    sb a0, 16(sp)
-; ZVFHMIN64-NEXT:    lh a0, 448(sp)
-; ZVFHMIN64-NEXT:    lh a1, 192(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 96(sp)
-; ZVFHMIN64-NEXT:    lh a0, 446(sp)
-; ZVFHMIN64-NEXT:    lh a1, 190(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 210(sp)
+; ZVFHMIN64-NEXT:    lh a1, 546(sp)
+; ZVFHMIN64-NEXT:    lh t5, 290(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v24
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 95(sp)
-; ZVFHMIN64-NEXT:    lh a0, 444(sp)
-; ZVFHMIN64-NEXT:    lh a1, 188(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN64-NEXT:    sb a1, 209(sp)
+; ZVFHMIN64-NEXT:    lh a1, 544(sp)
+; ZVFHMIN64-NEXT:    lh t5, 288(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a4, 192(sp)
+; ZVFHMIN64-NEXT:    sb a1, 208(sp)
+; ZVFHMIN64-NEXT:    lh t5, 738(sp)
+; ZVFHMIN64-NEXT:    lh t6, 482(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v12
+; ZVFHMIN64-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v10
+; ZVFHMIN64-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t5, 177(sp)
+; ZVFHMIN64-NEXT:    lh t5, 736(sp)
+; ZVFHMIN64-NEXT:    lh t6, 480(sp)
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 29
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s5, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 28
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s6, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t5, 176(sp)
+; ZVFHMIN64-NEXT:    lh t5, 734(sp)
+; ZVFHMIN64-NEXT:    lh t6, 478(sp)
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 27
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s7, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 26
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s8, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t5, 175(sp)
+; ZVFHMIN64-NEXT:    lh t5, 732(sp)
+; ZVFHMIN64-NEXT:    lh t6, 476(sp)
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 25
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s4, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 24
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s3, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t5, 174(sp)
+; ZVFHMIN64-NEXT:    lh t6, 730(sp)
+; ZVFHMIN64-NEXT:    lh s9, 474(sp)
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 23
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s2, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN64-NEXT:    feq.h t6, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t6, 173(sp)
+; ZVFHMIN64-NEXT:    lh s9, 728(sp)
+; ZVFHMIN64-NEXT:    lh s10, 472(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v31
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v13
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
+; ZVFHMIN64-NEXT:    sb s9, 172(sp)
+; ZVFHMIN64-NEXT:    lh s9, 726(sp)
+; ZVFHMIN64-NEXT:    lh s10, 470(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v29
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v11
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
+; ZVFHMIN64-NEXT:    sb s9, 171(sp)
+; ZVFHMIN64-NEXT:    lh s10, 724(sp)
+; ZVFHMIN64-NEXT:    lh s11, 468(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v7
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v9
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s10
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s11
+; ZVFHMIN64-NEXT:    feq.h s10, fa5, fa4
+; ZVFHMIN64-NEXT:    sb s10, 170(sp)
+; ZVFHMIN64-NEXT:    lh a0, 722(sp)
+; ZVFHMIN64-NEXT:    lh a1, 466(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v21
+; ZVFHMIN64-NEXT:    vmv.x.s s11, v27
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 94(sp)
-; ZVFHMIN64-NEXT:    lh a0, 442(sp)
-; ZVFHMIN64-NEXT:    lh a1, 186(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    sb a0, 169(sp)
+; ZVFHMIN64-NEXT:    lh a0, 720(sp)
+; ZVFHMIN64-NEXT:    lh a1, 464(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 168(sp)
+; ZVFHMIN64-NEXT:    lh a0, 718(sp)
+; ZVFHMIN64-NEXT:    lh a1, 462(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, s7
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s8
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa1, fa0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, ra
+; ZVFHMIN64-NEXT:    sb a0, 167(sp)
+; ZVFHMIN64-NEXT:    lh a0, 716(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN64-NEXT:    lh a1, 460(sp)
+; ZVFHMIN64-NEXT:    feq.h s5, fa5, fa1
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 93(sp)
-; ZVFHMIN64-NEXT:    lh a0, 440(sp)
-; ZVFHMIN64-NEXT:    lh a1, 184(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN64-NEXT:    sb a1, 166(sp)
+; ZVFHMIN64-NEXT:    lh a1, 714(sp)
+; ZVFHMIN64-NEXT:    lh a2, 458(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a3, fa3, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN64-NEXT:    sb a1, 165(sp)
+; ZVFHMIN64-NEXT:    lh a1, 712(sp)
+; ZVFHMIN64-NEXT:    lh a2, 456(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT:    feq.h a4, fa2, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa3, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, s2
+; ZVFHMIN64-NEXT:    sb a1, 164(sp)
+; ZVFHMIN64-NEXT:    lh a1, 710(sp)
+; ZVFHMIN64-NEXT:    lh a2, 454(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s9
+; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s10
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s11
+; ZVFHMIN64-NEXT:    sb a1, 163(sp)
+; ZVFHMIN64-NEXT:    lh a1, 708(sp)
+; ZVFHMIN64-NEXT:    lh a2, 452(sp)
+; ZVFHMIN64-NEXT:    feq.h s3, fa4, fa5
+; ZVFHMIN64-NEXT:    feq.h s4, fa3, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 162(sp)
+; ZVFHMIN64-NEXT:    lh a1, 706(sp)
+; ZVFHMIN64-NEXT:    lh a2, 450(sp)
+; ZVFHMIN64-NEXT:    sb s4, 129(sp)
+; ZVFHMIN64-NEXT:    sb s3, 130(sp)
+; ZVFHMIN64-NEXT:    sb s2, 131(sp)
+; ZVFHMIN64-NEXT:    sb a4, 132(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a3, 133(sp)
+; ZVFHMIN64-NEXT:    sb a0, 134(sp)
+; ZVFHMIN64-NEXT:    sb s5, 135(sp)
+; ZVFHMIN64-NEXT:    sb a1, 161(sp)
+; ZVFHMIN64-NEXT:    lh a0, 610(sp)
+; ZVFHMIN64-NEXT:    lh a1, 354(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v5
+; ZVFHMIN64-NEXT:    vmv.x.s s5, v23
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 92(sp)
-; ZVFHMIN64-NEXT:    lh a0, 438(sp)
-; ZVFHMIN64-NEXT:    lh a1, 182(sp)
+; ZVFHMIN64-NEXT:    sb a0, 241(sp)
+; ZVFHMIN64-NEXT:    lh a0, 608(sp)
+; ZVFHMIN64-NEXT:    lh a1, 352(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 21
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 20
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 91(sp)
-; ZVFHMIN64-NEXT:    lh a0, 436(sp)
-; ZVFHMIN64-NEXT:    lh a1, 180(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    sb a0, 240(sp)
+; ZVFHMIN64-NEXT:    lh a0, 606(sp)
+; ZVFHMIN64-NEXT:    lh a1, 350(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 22
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT:    sb a0, 239(sp)
+; ZVFHMIN64-NEXT:    lh a0, 604(sp)
+; ZVFHMIN64-NEXT:    lh a1, 348(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 238(sp)
+; ZVFHMIN64-NEXT:    lh a0, 602(sp)
+; ZVFHMIN64-NEXT:    lh a1, 346(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 237(sp)
+; ZVFHMIN64-NEXT:    lh a0, 600(sp)
+; ZVFHMIN64-NEXT:    lh a1, 344(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 236(sp)
+; ZVFHMIN64-NEXT:    lh a0, 598(sp)
+; ZVFHMIN64-NEXT:    lh a1, 342(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 235(sp)
+; ZVFHMIN64-NEXT:    lh a0, 596(sp)
+; ZVFHMIN64-NEXT:    lh a1, 340(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 234(sp)
+; ZVFHMIN64-NEXT:    lh a0, 594(sp)
+; ZVFHMIN64-NEXT:    lh a1, 338(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 233(sp)
+; ZVFHMIN64-NEXT:    lh a0, 592(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    lh t5, 336(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, t5
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN64-NEXT:    sb a0, 232(sp)
+; ZVFHMIN64-NEXT:    lh a0, 590(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a3
+; ZVFHMIN64-NEXT:    lh a2, 334(sp)
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    feq.h t6, fa4, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 90(sp)
-; ZVFHMIN64-NEXT:    lh a0, 434(sp)
-; ZVFHMIN64-NEXT:    lh a1, 178(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s6
+; ZVFHMIN64-NEXT:    sb a0, 231(sp)
+; ZVFHMIN64-NEXT:    lh a0, 588(sp)
+; ZVFHMIN64-NEXT:    lh a2, 332(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 89(sp)
-; ZVFHMIN64-NEXT:    lh a0, 432(sp)
-; ZVFHMIN64-NEXT:    lh a1, 176(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT:    sb a0, 230(sp)
+; ZVFHMIN64-NEXT:    lh a0, 586(sp)
+; ZVFHMIN64-NEXT:    lh a2, 330(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s8
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 88(sp)
-; ZVFHMIN64-NEXT:    lh a0, 430(sp)
-; ZVFHMIN64-NEXT:    lh a1, 174(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN64-NEXT:    sb a0, 229(sp)
+; ZVFHMIN64-NEXT:    lh a0, 584(sp)
+; ZVFHMIN64-NEXT:    lh a2, 328(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN64-NEXT:    feq.h s4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 87(sp)
-; ZVFHMIN64-NEXT:    lh a0, 428(sp)
-; ZVFHMIN64-NEXT:    lh a1, 172(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
+; ZVFHMIN64-NEXT:    sb a0, 228(sp)
+; ZVFHMIN64-NEXT:    lh a0, 582(sp)
+; ZVFHMIN64-NEXT:    lh a2, 326(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 86(sp)
-; ZVFHMIN64-NEXT:    lh a0, 426(sp)
-; ZVFHMIN64-NEXT:    lh a1, 170(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN64-NEXT:    sb a0, 227(sp)
+; ZVFHMIN64-NEXT:    lh a0, 580(sp)
+; ZVFHMIN64-NEXT:    lh a2, 324(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s7
+; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 85(sp)
-; ZVFHMIN64-NEXT:    lh a0, 424(sp)
-; ZVFHMIN64-NEXT:    lh a1, 168(sp)
+; ZVFHMIN64-NEXT:    sb a0, 226(sp)
+; ZVFHMIN64-NEXT:    lh a0, 578(sp)
+; ZVFHMIN64-NEXT:    lh a2, 322(sp)
+; ZVFHMIN64-NEXT:    sb s2, 193(sp)
+; ZVFHMIN64-NEXT:    sb a1, 194(sp)
+; ZVFHMIN64-NEXT:    sb s4, 195(sp)
+; ZVFHMIN64-NEXT:    sb a4, 196(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 84(sp)
-; ZVFHMIN64-NEXT:    lh a0, 422(sp)
-; ZVFHMIN64-NEXT:    lh a1, 166(sp)
+; ZVFHMIN64-NEXT:    sb a3, 197(sp)
+; ZVFHMIN64-NEXT:    sb t6, 198(sp)
+; ZVFHMIN64-NEXT:    sb t5, 199(sp)
+; ZVFHMIN64-NEXT:    sb a0, 225(sp)
+; ZVFHMIN64-NEXT:    lh a0, 766(sp)
+; ZVFHMIN64-NEXT:    lh a1, 510(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 14
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 83(sp)
-; ZVFHMIN64-NEXT:    lh a0, 420(sp)
-; ZVFHMIN64-NEXT:    lh a1, 164(sp)
+; ZVFHMIN64-NEXT:    sb a0, 191(sp)
+; ZVFHMIN64-NEXT:    lh a0, 764(sp)
+; ZVFHMIN64-NEXT:    lh a1, 508(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v6
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 82(sp)
-; ZVFHMIN64-NEXT:    lh a0, 418(sp)
-; ZVFHMIN64-NEXT:    lh a1, 162(sp)
+; ZVFHMIN64-NEXT:    sb a0, 190(sp)
+; ZVFHMIN64-NEXT:    lh a0, 762(sp)
+; ZVFHMIN64-NEXT:    lh a1, 506(sp)
+; ZVFHMIN64-NEXT:    csrr a3, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a3, 3
+; ZVFHMIN64-NEXT:    add a3, sp, a3
+; ZVFHMIN64-NEXT:    addi a3, a3, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    csrr a4, vlenb
+; ZVFHMIN64-NEXT:    li s3, 6
+; ZVFHMIN64-NEXT:    mul a4, a4, s3
+; ZVFHMIN64-NEXT:    add a4, sp, a4
+; ZVFHMIN64-NEXT:    addi a4, a4, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 81(sp)
-; ZVFHMIN64-NEXT:    lh a0, 416(sp)
-; ZVFHMIN64-NEXT:    lh a1, 160(sp)
+; ZVFHMIN64-NEXT:    sb a0, 189(sp)
+; ZVFHMIN64-NEXT:    lh a0, 760(sp)
+; ZVFHMIN64-NEXT:    lh a1, 504(sp)
+; ZVFHMIN64-NEXT:    csrr s3, vlenb
+; ZVFHMIN64-NEXT:    li s4, 12
+; ZVFHMIN64-NEXT:    mul s3, s3, s4
+; ZVFHMIN64-NEXT:    add s3, sp, s3
+; ZVFHMIN64-NEXT:    addi s3, s3, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v8
+; ZVFHMIN64-NEXT:    csrr s3, vlenb
+; ZVFHMIN64-NEXT:    li s4, 10
+; ZVFHMIN64-NEXT:    mul s3, s3, s4
+; ZVFHMIN64-NEXT:    add s3, sp, s3
+; ZVFHMIN64-NEXT:    addi s3, s3, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s4, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v24
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v16
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a1, 64(sp)
-; ZVFHMIN64-NEXT:    sb a0, 80(sp)
-; ZVFHMIN64-NEXT:    lh a0, 610(sp)
-; ZVFHMIN64-NEXT:    lh a1, 354(sp)
+; ZVFHMIN64-NEXT:    sb a0, 188(sp)
+; ZVFHMIN64-NEXT:    lh a0, 758(sp)
+; ZVFHMIN64-NEXT:    lh a1, 502(sp)
+; ZVFHMIN64-NEXT:    csrr s3, vlenb
+; ZVFHMIN64-NEXT:    slli s3, s3, 4
+; ZVFHMIN64-NEXT:    add s3, sp, s3
+; ZVFHMIN64-NEXT:    addi s3, s3, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s5, v8
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v16
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 49(sp)
-; ZVFHMIN64-NEXT:    lh a0, 608(sp)
-; ZVFHMIN64-NEXT:    lh a1, 352(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT:    sb a0, 187(sp)
+; ZVFHMIN64-NEXT:    lh a0, 756(sp)
+; ZVFHMIN64-NEXT:    lh a1, 500(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h t4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 48(sp)
-; ZVFHMIN64-NEXT:    lh a0, 606(sp)
-; ZVFHMIN64-NEXT:    lh a1, 350(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN64-NEXT:    sb a0, 186(sp)
+; ZVFHMIN64-NEXT:    lh a0, 754(sp)
+; ZVFHMIN64-NEXT:    lh a1, 498(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h t3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 47(sp)
-; ZVFHMIN64-NEXT:    lh a1, 604(sp)
-; ZVFHMIN64-NEXT:    lh a2, 348(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 7
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 46(sp)
-; ZVFHMIN64-NEXT:    lh a2, 602(sp)
-; ZVFHMIN64-NEXT:    lh a3, 346(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 45(sp)
-; ZVFHMIN64-NEXT:    lh a3, 600(sp)
-; ZVFHMIN64-NEXT:    lh a4, 344(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 6
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a3, 44(sp)
-; ZVFHMIN64-NEXT:    lh a4, 598(sp)
-; ZVFHMIN64-NEXT:    lh a5, 342(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 6
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 43(sp)
-; ZVFHMIN64-NEXT:    lh a5, 596(sp)
-; ZVFHMIN64-NEXT:    lh a6, 340(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 5
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a5, 42(sp)
-; ZVFHMIN64-NEXT:    lh a6, 594(sp)
-; ZVFHMIN64-NEXT:    lh a7, 338(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 5
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a6, 41(sp)
-; ZVFHMIN64-NEXT:    lh a7, 592(sp)
-; ZVFHMIN64-NEXT:    lh t0, 336(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 4
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a7, 40(sp)
-; ZVFHMIN64-NEXT:    lh t0, 590(sp)
-; ZVFHMIN64-NEXT:    lh t1, 334(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 4
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t0, 39(sp)
-; ZVFHMIN64-NEXT:    lh t1, 588(sp)
-; ZVFHMIN64-NEXT:    lh t2, 332(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 3
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v10
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    sb a0, 185(sp)
+; ZVFHMIN64-NEXT:    lh a0, 752(sp)
+; ZVFHMIN64-NEXT:    lh a1, 496(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
 ; ZVFHMIN64-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t1, 38(sp)
-; ZVFHMIN64-NEXT:    lh t2, 586(sp)
-; ZVFHMIN64-NEXT:    lh t3, 330(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 3
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 37(sp)
-; ZVFHMIN64-NEXT:    lh t2, 584(sp)
-; ZVFHMIN64-NEXT:    lh t3, 328(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 2
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 36(sp)
-; ZVFHMIN64-NEXT:    lh t2, 582(sp)
-; ZVFHMIN64-NEXT:    lh t3, 326(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 2
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 35(sp)
-; ZVFHMIN64-NEXT:    lh t2, 580(sp)
-; ZVFHMIN64-NEXT:    lh t3, 324(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 1
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 34(sp)
-; ZVFHMIN64-NEXT:    lh t2, 578(sp)
-; ZVFHMIN64-NEXT:    lh t3, 322(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 1
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v10
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 184(sp)
+; ZVFHMIN64-NEXT:    lh a0, 750(sp)
+; ZVFHMIN64-NEXT:    lh a1, 494(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a2, 5(sp)
-; ZVFHMIN64-NEXT:    sb a1, 6(sp)
-; ZVFHMIN64-NEXT:    sb a0, 7(sp)
-; ZVFHMIN64-NEXT:    sb t2, 33(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 1(sp)
-; ZVFHMIN64-NEXT:    sb a2, 2(sp)
-; ZVFHMIN64-NEXT:    sb a1, 3(sp)
-; ZVFHMIN64-NEXT:    sb a0, 4(sp)
-; ZVFHMIN64-NEXT:    lh a0, 482(sp)
-; ZVFHMIN64-NEXT:    lh a1, 226(sp)
+; ZVFHMIN64-NEXT:    sb a0, 183(sp)
+; ZVFHMIN64-NEXT:    lh a0, 748(sp)
+; ZVFHMIN64-NEXT:    lh a1, 492(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s4
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 113(sp)
-; ZVFHMIN64-NEXT:    lh a0, 480(sp)
-; ZVFHMIN64-NEXT:    lh a1, 224(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
+; ZVFHMIN64-NEXT:    sb a0, 182(sp)
+; ZVFHMIN64-NEXT:    lh a0, 746(sp)
+; ZVFHMIN64-NEXT:    lh a1, 490(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s5
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 112(sp)
-; ZVFHMIN64-NEXT:    lh a0, 478(sp)
-; ZVFHMIN64-NEXT:    lh a1, 222(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT:    sb a0, 181(sp)
+; ZVFHMIN64-NEXT:    lh a0, 744(sp)
+; ZVFHMIN64-NEXT:    lh a1, 488(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 111(sp)
-; ZVFHMIN64-NEXT:    lh a1, 476(sp)
-; ZVFHMIN64-NEXT:    lh a2, 220(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 7
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 110(sp)
-; ZVFHMIN64-NEXT:    lh a2, 474(sp)
-; ZVFHMIN64-NEXT:    lh a3, 218(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 7
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 109(sp)
-; ZVFHMIN64-NEXT:    lh a3, 472(sp)
-; ZVFHMIN64-NEXT:    lh a4, 216(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 6
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a3, 108(sp)
-; ZVFHMIN64-NEXT:    lh a4, 470(sp)
-; ZVFHMIN64-NEXT:    lh a5, 214(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 6
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 107(sp)
-; ZVFHMIN64-NEXT:    lh a5, 468(sp)
-; ZVFHMIN64-NEXT:    lh a6, 212(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 5
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v10
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a5, 106(sp)
-; ZVFHMIN64-NEXT:    lh a6, 466(sp)
-; ZVFHMIN64-NEXT:    lh a7, 210(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 5
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT:    addi a1, sp, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN64-NEXT:    sb a0, 180(sp)
+; ZVFHMIN64-NEXT:    lh a0, 742(sp)
+; ZVFHMIN64-NEXT:    lh a7, 486(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a6, 105(sp)
-; ZVFHMIN64-NEXT:    lh a7, 464(sp)
-; ZVFHMIN64-NEXT:    lh t0, 208(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 4
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a7, 104(sp)
-; ZVFHMIN64-NEXT:    lh t0, 462(sp)
-; ZVFHMIN64-NEXT:    lh t1, 206(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 4
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t0, 103(sp)
-; ZVFHMIN64-NEXT:    lh t1, 460(sp)
-; ZVFHMIN64-NEXT:    lh t2, 204(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 3
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t1, 102(sp)
-; ZVFHMIN64-NEXT:    lh t2, 458(sp)
-; ZVFHMIN64-NEXT:    lh t3, 202(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 3
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 101(sp)
-; ZVFHMIN64-NEXT:    lh t2, 456(sp)
-; ZVFHMIN64-NEXT:    lh t3, 200(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 2
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 100(sp)
-; ZVFHMIN64-NEXT:    lh t2, 454(sp)
-; ZVFHMIN64-NEXT:    lh t3, 198(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 2
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 99(sp)
-; ZVFHMIN64-NEXT:    lh t2, 452(sp)
-; ZVFHMIN64-NEXT:    lh t3, 196(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 1
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 98(sp)
-; ZVFHMIN64-NEXT:    lh t2, 450(sp)
-; ZVFHMIN64-NEXT:    lh t3, 194(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 1
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 179(sp)
+; ZVFHMIN64-NEXT:    lh a0, 740(sp)
+; ZVFHMIN64-NEXT:    lh a7, 484(sp)
+; ZVFHMIN64-NEXT:    sb a2, 140(sp)
+; ZVFHMIN64-NEXT:    sb t1, 141(sp)
+; ZVFHMIN64-NEXT:    sb t3, 142(sp)
+; ZVFHMIN64-NEXT:    sb t4, 143(sp)
+; ZVFHMIN64-NEXT:    sb a1, 136(sp)
+; ZVFHMIN64-NEXT:    sb a6, 137(sp)
+; ZVFHMIN64-NEXT:    sb a4, 138(sp)
+; ZVFHMIN64-NEXT:    sb a3, 139(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a2, 69(sp)
-; ZVFHMIN64-NEXT:    sb a1, 70(sp)
-; ZVFHMIN64-NEXT:    sb a0, 71(sp)
-; ZVFHMIN64-NEXT:    sb t2, 97(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 65(sp)
-; ZVFHMIN64-NEXT:    sb a2, 66(sp)
-; ZVFHMIN64-NEXT:    sb a1, 67(sp)
-; ZVFHMIN64-NEXT:    sb a0, 68(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 178(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 638(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 382(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 14
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 63(sp)
+; ZVFHMIN64-NEXT:    sb a0, 255(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 636(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 380(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 13
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 62(sp)
+; ZVFHMIN64-NEXT:    sb a0, 254(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 634(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 378(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 12
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 61(sp)
+; ZVFHMIN64-NEXT:    sb a0, 253(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 632(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 376(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 11
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 60(sp)
+; ZVFHMIN64-NEXT:    sb a0, 252(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 630(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 374(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 10
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 59(sp)
+; ZVFHMIN64-NEXT:    sb a0, 251(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 628(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 372(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 58(sp)
+; ZVFHMIN64-NEXT:    ld a1, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 250(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 626(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 370(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 57(sp)
+; ZVFHMIN64-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 249(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 624(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 368(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 56(sp)
+; ZVFHMIN64-NEXT:    ld a1, 120(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 248(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 622(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 366(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v0, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v22, v0, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v26, v0, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v28, v0, 12
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v0, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 10
-; ZVFHMIN64-NEXT:    vslidedown.vi v12, v0, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v14, v0, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v20
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 55(sp)
+; ZVFHMIN64-NEXT:    ld a1, 88(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 247(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 620(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 364(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 15
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v20
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v22
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 54(sp)
+; ZVFHMIN64-NEXT:    ld a1, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 246(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 618(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 362(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v20
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v26
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
+; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 53(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN64-NEXT:    sb a0, 245(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 616(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 360(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 13
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v20
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v28
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 52(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT:    sb a0, 244(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 614(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 358(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 12
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v20
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
+; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 51(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 8
+; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    sb a0, 243(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 612(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 356(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v18
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v8, 11
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v18
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 50(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 12(sp)
-; ZVFHMIN64-NEXT:    sb a2, 13(sp)
-; ZVFHMIN64-NEXT:    sb a1, 14(sp)
-; ZVFHMIN64-NEXT:    sb a0, 15(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 10
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v12
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 9
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v14
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v8, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 8(sp)
-; ZVFHMIN64-NEXT:    sb a2, 9(sp)
-; ZVFHMIN64-NEXT:    sb a1, 10(sp)
-; ZVFHMIN64-NEXT:    sb a0, 11(sp)
-; ZVFHMIN64-NEXT:    lh a0, 510(sp)
-; ZVFHMIN64-NEXT:    lh a1, 254(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 127(sp)
-; ZVFHMIN64-NEXT:    lh a0, 508(sp)
-; ZVFHMIN64-NEXT:    lh a1, 252(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 126(sp)
-; ZVFHMIN64-NEXT:    lh a0, 506(sp)
-; ZVFHMIN64-NEXT:    lh a1, 250(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 125(sp)
-; ZVFHMIN64-NEXT:    lh a0, 504(sp)
-; ZVFHMIN64-NEXT:    lh a1, 248(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 124(sp)
-; ZVFHMIN64-NEXT:    lh a0, 502(sp)
-; ZVFHMIN64-NEXT:    lh a1, 246(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 123(sp)
-; ZVFHMIN64-NEXT:    lh a0, 500(sp)
-; ZVFHMIN64-NEXT:    lh a1, 244(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 122(sp)
-; ZVFHMIN64-NEXT:    lh a0, 498(sp)
-; ZVFHMIN64-NEXT:    lh a1, 242(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v12, v24, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v14, v24, 12
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v24, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v24, 10
-; ZVFHMIN64-NEXT:    vslidedown.vi v22, v24, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v24, v24, 8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 121(sp)
-; ZVFHMIN64-NEXT:    lh a2, 496(sp)
-; ZVFHMIN64-NEXT:    lh a3, 240(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 15
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    sb a5, 204(sp)
+; ZVFHMIN64-NEXT:    sb a4, 205(sp)
+; ZVFHMIN64-NEXT:    sb a2, 206(sp)
+; ZVFHMIN64-NEXT:    sb a3, 207(sp)
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 120(sp)
-; ZVFHMIN64-NEXT:    lh a4, 494(sp)
-; ZVFHMIN64-NEXT:    lh a5, 238(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 14
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 119(sp)
-; ZVFHMIN64-NEXT:    lh a4, 492(sp)
-; ZVFHMIN64-NEXT:    lh a5, 236(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v12
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 13
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 118(sp)
-; ZVFHMIN64-NEXT:    lh a4, 490(sp)
-; ZVFHMIN64-NEXT:    lh a5, 234(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v14
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 12
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 117(sp)
-; ZVFHMIN64-NEXT:    lh a4, 488(sp)
-; ZVFHMIN64-NEXT:    lh a5, 232(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v18
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 11
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 116(sp)
-; ZVFHMIN64-NEXT:    lh a4, 486(sp)
-; ZVFHMIN64-NEXT:    lh a5, 230(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v20
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 10
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 115(sp)
-; ZVFHMIN64-NEXT:    lh a4, 484(sp)
-; ZVFHMIN64-NEXT:    lh a5, 228(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v22
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 9
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
-; ZVFHMIN64-NEXT:    vmv.x.s s3, v24
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 114(sp)
+; ZVFHMIN64-NEXT:    sb a2, 200(sp)
+; ZVFHMIN64-NEXT:    sb a6, 201(sp)
+; ZVFHMIN64-NEXT:    sb a7, 202(sp)
+; ZVFHMIN64-NEXT:    sb t0, 203(sp)
+; ZVFHMIN64-NEXT:    li a2, 128
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 76(sp)
-; ZVFHMIN64-NEXT:    sb a2, 77(sp)
-; ZVFHMIN64-NEXT:    sb a1, 78(sp)
-; ZVFHMIN64-NEXT:    sb a0, 79(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 72(sp)
-; ZVFHMIN64-NEXT:    sb a2, 73(sp)
-; ZVFHMIN64-NEXT:    sb a1, 74(sp)
-; ZVFHMIN64-NEXT:    sb a0, 75(sp)
-; ZVFHMIN64-NEXT:    li a0, 128
-; ZVFHMIN64-NEXT:    mv a1, sp
-; ZVFHMIN64-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; ZVFHMIN64-NEXT:    vle8.v v8, (a1)
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 242(sp)
+; ZVFHMIN64-NEXT:    addi a0, sp, 128
+; ZVFHMIN64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN64-NEXT:    vle8.v v8, (a0)
 ; ZVFHMIN64-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN64-NEXT:    vmsne.vi v0, v8, 0
-; ZVFHMIN64-NEXT:    addi sp, s0, -768
-; ZVFHMIN64-NEXT:    .cfi_def_cfa sp, 768
-; ZVFHMIN64-NEXT:    ld ra, 760(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s0, 752(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s2, 744(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s3, 736(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    addi sp, s0, -896
+; ZVFHMIN64-NEXT:    .cfi_def_cfa sp, 896
+; ZVFHMIN64-NEXT:    ld ra, 888(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s0, 880(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s2, 872(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s3, 864(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s4, 856(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s5, 848(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s6, 840(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s7, 832(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s8, 824(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s9, 816(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s10, 808(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s11, 800(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    .cfi_restore ra
 ; ZVFHMIN64-NEXT:    .cfi_restore s0
 ; ZVFHMIN64-NEXT:    .cfi_restore s2
 ; ZVFHMIN64-NEXT:    .cfi_restore s3
-; ZVFHMIN64-NEXT:    addi sp, sp, 768
+; ZVFHMIN64-NEXT:    .cfi_restore s4
+; ZVFHMIN64-NEXT:    .cfi_restore s5
+; ZVFHMIN64-NEXT:    .cfi_restore s6
+; ZVFHMIN64-NEXT:    .cfi_restore s7
+; ZVFHMIN64-NEXT:    .cfi_restore s8
+; ZVFHMIN64-NEXT:    .cfi_restore s9
+; ZVFHMIN64-NEXT:    .cfi_restore s10
+; ZVFHMIN64-NEXT:    .cfi_restore s11
+; ZVFHMIN64-NEXT:    addi sp, sp, 896
 ; ZVFHMIN64-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN64-NEXT:    ret
   %v = call <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half> %va, <128 x half> %vb, metadata !"oeq", <128 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index c4ef8e059a5860..d52c42891fcc3b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -28,8 +28,8 @@ define <8 x i1> @icmp_eq_vx_v8i7(<8 x i7> %va, i7 %b, <8 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmseq.vv v0, v8, v9, v0.t
@@ -45,8 +45,8 @@ define <8 x i1> @icmp_eq_vx_swap_v8i7(<8 x i7> %va, i7 %b, <8 x i1> %m, i32 zero
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmseq.vv v0, v9, v8, v0.t
@@ -605,11 +605,11 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    addi a4, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a2)
-; CHECK-NEXT:    addi a2, a0, 128
-; CHECK-NEXT:    vle8.v v8, (a2)
 ; CHECK-NEXT:    addi a2, a3, -128
+; CHECK-NEXT:    vle8.v v8, (a4)
 ; CHECK-NEXT:    sltu a4, a3, a2
 ; CHECK-NEXT:    vle8.v v24, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
index c9e6a8730eec7e..487cf3f5cf95d8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
@@ -9,28 +9,28 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) {
 ; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v0
 ; RV32-NEXT:    slli a1, a0, 18
-; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    srli a2, a0, 31
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    slli a1, a0, 27
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    slli a1, a0, 28
+; RV32-NEXT:    slli a2, a0, 27
 ; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    slli a1, a0, 19
+; RV32-NEXT:    slli a1, a0, 26
 ; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    slli a2, a0, 26
+; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    slli a1, a0, 28
 ; RV32-NEXT:    srli a2, a2, 31
-; RV32-NEXT:    vmv.v.x v9, a2
-; RV32-NEXT:    vslide1down.vx v9, v9, a1
-; RV32-NEXT:    slli a1, a0, 24
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v9, v9, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    slli a2, a0, 19
+; RV32-NEXT:    srli a2, a2, 31
+; RV32-NEXT:    vslide1down.vx v9, v9, a2
+; RV32-NEXT:    slli a2, a0, 24
 ; RV32-NEXT:    slli a0, a0, 29
+; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    srli a2, a2, 31
 ; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    vslide1down.vx v9, v9, a2
 ; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV32-NEXT:    vslidedown.vi v8, v9, 4, v0.t
@@ -43,28 +43,28 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) {
 ; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v0
 ; RV64-NEXT:    slli a1, a0, 50
-; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    srli a2, a0, 63
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64-NEXT:    vmv.v.x v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a1
-; RV64-NEXT:    slli a1, a0, 59
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v8, v8, a1
-; RV64-NEXT:    slli a1, a0, 60
+; RV64-NEXT:    slli a2, a0, 59
 ; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    vslide1down.vx v8, v8, a1
-; RV64-NEXT:    slli a1, a0, 51
+; RV64-NEXT:    slli a1, a0, 58
 ; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    slli a2, a0, 58
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    slli a1, a0, 60
 ; RV64-NEXT:    srli a2, a2, 63
-; RV64-NEXT:    vmv.v.x v9, a2
-; RV64-NEXT:    vslide1down.vx v9, v9, a1
-; RV64-NEXT:    slli a1, a0, 56
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v9, v9, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a2
+; RV64-NEXT:    slli a2, a0, 51
+; RV64-NEXT:    srli a2, a2, 63
+; RV64-NEXT:    vslide1down.vx v9, v9, a2
+; RV64-NEXT:    slli a2, a0, 56
 ; RV64-NEXT:    slli a0, a0, 61
+; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    vslide1down.vx v9, v9, a2
 ; RV64-NEXT:    vmv.v.i v0, 15
 ; RV64-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64-NEXT:    vslidedown.vi v8, v9, 4, v0.t
@@ -80,10 +80,10 @@ define <4 x i32> @v4i32_v8i32(<8 x i32>) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vmv.v.i v0, 5
 ; CHECK-NEXT:    vsrl.vi v10, v10, 1
 ; CHECK-NEXT:    vrsub.vi v11, v10, 3
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    vmv.v.i v0, 5
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 4
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
@@ -100,15 +100,14 @@ define <4 x i32> @v4i32_v16i32(<16 x i32>) {
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v12, 1
 ; RV32-NEXT:    vmv.v.i v13, 6
-; RV32-NEXT:    vsetivli zero, 2, e16, m1, tu, ma
-; RV32-NEXT:    vslideup.vi v13, v12, 1
+; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v16, v8, 8
 ; RV32-NEXT:    vmv4r.v v20, v8
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vmv2r.v v22, v14
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT:    vsetivli zero, 2, e16, m1, tu, ma
+; RV32-NEXT:    vslideup.vi v13, v12, 1
 ; RV32-NEXT:    vmv.v.i v0, 10
+; RV32-NEXT:    vmv2r.v v22, v14
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; RV32-NEXT:    vnsrl.wx v8, v20, a0
 ; RV32-NEXT:    vrgatherei16.vv v8, v16, v13, v0.t
@@ -116,19 +115,20 @@ define <4 x i32> @v4i32_v16i32(<16 x i32>) {
 ;
 ; RV64-LABEL: v4i32_v16i32:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    li a0, 32
 ; RV64-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v16, v8, 8
 ; RV64-NEXT:    vmv4r.v v20, v8
-; RV64-NEXT:    li a0, 32
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 10
 ; RV64-NEXT:    vmv2r.v v22, v12
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV64-NEXT:    vnsrl.wx v8, v20, a0
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v0, 10
 ; RV64-NEXT:    li a0, 3
 ; RV64-NEXT:    slli a0, a0, 33
 ; RV64-NEXT:    addi a0, a0, 1
 ; RV64-NEXT:    slli a0, a0, 16
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; RV64-NEXT:    vrgatherei16.vv v8, v16, v10, v0.t
@@ -151,21 +151,21 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) {
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v16, v8, 1
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    lw a0, 36(sp)
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    lw a0, 120(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 4
-; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    lw a0, 36(sp)
+; RV32-NEXT:    vmv.x.s a1, v16
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vslide1down.vx v8, v10, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    lw a1, 120(sp)
+; RV32-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    addi sp, s0, -256
 ; RV32-NEXT:    .cfi_def_cfa sp, 256
 ; RV32-NEXT:    lw ra, 252(sp) # 4-byte Folded Reload
@@ -189,21 +189,21 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) {
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    li a0, 32
 ; RV64-NEXT:    mv a1, sp
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v16, v8, 1
 ; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV64-NEXT:    vse32.v v8, (a1)
-; RV64-NEXT:    lw a0, 36(sp)
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    lw a0, 120(sp)
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-NEXT:    vmv.x.s a1, v8
+; RV64-NEXT:    lw a0, 36(sp)
+; RV64-NEXT:    vmv.x.s a1, v16
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vslide1down.vx v8, v10, a1
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    lw a1, 120(sp)
+; RV64-NEXT:    vslide1down.vx v9, v9, a0
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslide1down.vx v8, v9, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    addi sp, s0, -256
 ; RV64-NEXT:    .cfi_def_cfa sp, 256
 ; RV64-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
@@ -251,22 +251,24 @@ define <16 x i32> @v16i32_v4i32(<4 x i32>) {
 ; CHECK-LABEL: v16i32_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    addi a1, a0, 265
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 3
-; CHECK-NEXT:    vmerge.vim v10, v9, 2, v0
+; CHECK-NEXT:    addi a1, a0, 265
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 4
 ; CHECK-NEXT:    addi a1, a1, 548
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmerge.vim v9, v9, 2, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    addi a0, a0, -1856
-; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v9, v9, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v16, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -285,22 +287,23 @@ define <32 x i32> @v32i32_v4i32(<4 x i32>) {
 ; CHECK-NEXT:    addi a1, a1, 1161
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    lui a1, 270865
+; CHECK-NEXT:    addi a1, a1, 548
+; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    lui a1, 100550
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 3
-; CHECK-NEXT:    vmerge.vim v10, v10, 2, v0
-; CHECK-NEXT:    lui a0, 270865
-; CHECK-NEXT:    addi a0, a0, 548
+; CHECK-NEXT:    vmv.v.i v16, 3
+; CHECK-NEXT:    addi a0, a1, 64
+; CHECK-NEXT:    vmerge.vim v18, v16, 2, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    lui a0, 100550
-; CHECK-NEXT:    addi a0, a0, 64
-; CHECK-NEXT:    vmv.s.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
+; CHECK-NEXT:    vmv.s.x v16, a0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v18, v18, 0, v0
+; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vim v16, v18, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v24, v10
+; CHECK-NEXT:    vsext.vf2 v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index e4ea64c0bf955d..2aa7cb8687f544 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -53,12 +53,10 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x
 ; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLA-NEXT:    vslideup.vi v14, v15, 1
 ; VLA-NEXT:    vslideup.vi v16, v13, 1
-; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; VLA-NEXT:    vslideup.vi v16, v14, 2
-; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLA-NEXT:    vslideup.vi v10, v11, 1
 ; VLA-NEXT:    vslideup.vi v8, v9, 1
 ; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v16, v14, 2
 ; VLA-NEXT:    vslideup.vi v8, v10, 2
 ; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v16, 4
@@ -71,12 +69,10 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vslideup.vi v14, v15, 1
 ; VLS-NEXT:    vslideup.vi v17, v13, 1
-; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; VLS-NEXT:    vslideup.vi v17, v14, 2
-; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vslideup.vi v10, v11, 1
 ; VLS-NEXT:    vslideup.vi v16, v9, 1
 ; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v17, v14, 2
 ; VLS-NEXT:    vslideup.vi v16, v10, 2
 ; VLS-NEXT:    vmv2r.v v8, v16
 ; VLS-NEXT:    ret
@@ -131,16 +127,15 @@ define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 ; VLA-LABEL: concat_8xv2i32:
 ; VLA:       # %bb.0:
 ; VLA-NEXT:    vmv1r.v v16, v14
+; VLA-NEXT:    vmv1r.v v18, v10
 ; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; VLA-NEXT:    vslideup.vi v16, v15, 2
 ; VLA-NEXT:    vslideup.vi v12, v13, 2
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v12, v16, 4
-; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; VLA-NEXT:    vslideup.vi v10, v11, 2
+; VLA-NEXT:    vslideup.vi v18, v11, 2
 ; VLA-NEXT:    vslideup.vi v8, v9, 2
 ; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vslideup.vi v12, v16, 4
+; VLA-NEXT:    vslideup.vi v8, v18, 4
 ; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v12, 8
 ; VLA-NEXT:    ret
@@ -190,10 +185,10 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 ; VLA-NEXT:    vmv2r.v v20, v14
 ; VLA-NEXT:    vmv2r.v v16, v12
 ; VLA-NEXT:    vmv2r.v v12, v10
+; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; VLA-NEXT:    vslideup.vi v16, v20, 8
 ; VLA-NEXT:    vslideup.vi v8, v12, 8
-; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v16, 16
 ; VLA-NEXT:    ret
@@ -212,22 +207,20 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 ; VLA:       # %bb.0:
 ; VLA-NEXT:    vmv1r.v v18, v15
 ; VLA-NEXT:    vmv1r.v v20, v14
-; VLA-NEXT:    vmv1r.v v22, v13
+; VLA-NEXT:    vmv1r.v v24, v13
 ; VLA-NEXT:    vmv1r.v v16, v12
-; VLA-NEXT:    vmv1r.v v14, v11
+; VLA-NEXT:    vmv1r.v v26, v11
 ; VLA-NEXT:    vmv1r.v v12, v10
 ; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; VLA-NEXT:    vslideup.vi v20, v18, 4
-; VLA-NEXT:    vslideup.vi v16, v22, 4
-; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLA-NEXT:    vslideup.vi v16, v20, 8
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vslideup.vi v16, v24, 4
+; VLA-NEXT:    vslideup.vi v12, v26, 4
 ; VLA-NEXT:    vslideup.vi v8, v10, 4
 ; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v16, v20, 8
 ; VLA-NEXT:    vslideup.vi v8, v12, 8
-; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v16, 16
 ; VLA-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index a8f75f8d1c24d1..f04faf5cd2c54f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -15,15 +15,15 @@ define void @deinterleave3_0_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 3
 ; CHECK-NEXT:    vmul.vx v9, v9, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vadd.vi v9, v9, -8
 ; CHECK-NEXT:    li a0, 56
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vadd.vi v10, v9, -8
+; CHECK-NEXT:    vrgather.vv v11, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vse8.v v10, (a1)
+; CHECK-NEXT:    vrgather.vv v11, v8, v10, v0.t
+; CHECK-NEXT:    vse8.v v11, (a1)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i8>, ptr %in, align 1
@@ -42,13 +42,13 @@ define void @deinterleave3_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 3
 ; CHECK-NEXT:    vmadd.vx v10, a0, v9
+; CHECK-NEXT:    li a0, 24
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    li a0, 24
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
@@ -67,13 +67,13 @@ define void @deinterleave4_0_i8(ptr %in, ptr %out) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 4
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v10, v8, v9
-; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vsll.vi v9, v9, 2
@@ -100,11 +100,11 @@ define void @deinterleave4_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vmv.v.i v9, -9
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 5
+; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vmacc.vx v9, a0, v10
 ; CHECK-NEXT:    vsll.vi v10, v10, 2
 ; CHECK-NEXT:    vadd.vi v10, v10, 1
 ; CHECK-NEXT:    vrgather.vv v11, v8, v10
-; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -126,15 +126,15 @@ define void @deinterleave5_0_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 5
-; CHECK-NEXT:    vmul.vx v9, v9, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vadd.vi v9, v9, -8
 ; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vmul.vx v9, v9, a0
+; CHECK-NEXT:    vadd.vi v10, v9, -8
+; CHECK-NEXT:    vrgather.vv v11, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vse8.v v10, (a1)
+; CHECK-NEXT:    vrgather.vv v11, v8, v10, v0.t
+; CHECK-NEXT:    vse8.v v11, (a1)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i8>, ptr %in, align 1
@@ -153,8 +153,8 @@ define void @deinterleave5_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 5
 ; CHECK-NEXT:    vmadd.vx v10, a0, v9
-; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.i v0, 4
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -176,9 +176,9 @@ define void @deinterleave6_0_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 6
+; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vmul.vx v9, v9, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -202,8 +202,8 @@ define void @deinterleave6_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 6
 ; CHECK-NEXT:    vmadd.vx v10, a0, v9
-; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.i v0, 4
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -225,9 +225,9 @@ define void @deinterleave7_0_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 7
+; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vmul.vx v9, v9, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -250,8 +250,8 @@ define void @deinterleave7_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vmv.v.i v9, -6
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 6
-; CHECK-NEXT:    vmadd.vx v10, a0, v9
 ; CHECK-NEXT:    vmv.v.i v0, 6
+; CHECK-NEXT:    vmadd.vx v10, a0, v9
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index d461fa8378cffc..73f5c2fc251713 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -57,8 +57,8 @@ define <4 x i64> @m2_pair_swap_vl4(<4 x i64> %v1) vscale_range(2,2) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v11, v9, 1
-; CHECK-NEXT:    vslideup.vi v11, v9, 1
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 1
+; CHECK-NEXT:    vslideup.vi v11, v9, 1
 ; CHECK-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index b70aff413aec53..863b3008c2ce99 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -93,18 +93,19 @@ define <32 x i1> @reverse_v32i1(<32 x i1> %a) {
 ; NO-ZVBB-LABEL: reverse_v32i1:
 ; NO-ZVBB:       # %bb.0:
 ; NO-ZVBB-NEXT:    li a0, 32
-; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; NO-ZVBB-NEXT:    vmv.v.i v8, 0
-; NO-ZVBB-NEXT:    vmerge.vim v8, v8, 1, v0
 ; NO-ZVBB-NEXT:    csrr a1, vlenb
+; NO-ZVBB-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; NO-ZVBB-NEXT:    vid.v v8
+; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; NO-ZVBB-NEXT:    vmv.v.i v10, 0
 ; NO-ZVBB-NEXT:    addi a2, a1, -1
+; NO-ZVBB-NEXT:    slli a1, a1, 1
+; NO-ZVBB-NEXT:    vmerge.vim v10, v10, 1, v0
 ; NO-ZVBB-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; NO-ZVBB-NEXT:    vid.v v10
-; NO-ZVBB-NEXT:    vrsub.vx v10, v10, a2
+; NO-ZVBB-NEXT:    vrsub.vx v8, v8, a2
 ; NO-ZVBB-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; NO-ZVBB-NEXT:    vrgatherei16.vv v13, v8, v10
-; NO-ZVBB-NEXT:    vrgatherei16.vv v12, v9, v10
-; NO-ZVBB-NEXT:    slli a1, a1, 1
+; NO-ZVBB-NEXT:    vrgatherei16.vv v13, v10, v8
+; NO-ZVBB-NEXT:    vrgatherei16.vv v12, v11, v8
 ; NO-ZVBB-NEXT:    addi a1, a1, -32
 ; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; NO-ZVBB-NEXT:    vslidedown.vx v8, v12, a1
@@ -124,23 +125,24 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) {
 ; NO-ZVBB-LABEL: reverse_v64i1:
 ; NO-ZVBB:       # %bb.0:
 ; NO-ZVBB-NEXT:    li a0, 64
+; NO-ZVBB-NEXT:    csrr a1, vlenb
+; NO-ZVBB-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; NO-ZVBB-NEXT:    vid.v v12
 ; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; NO-ZVBB-NEXT:    vmv.v.i v8, 0
-; NO-ZVBB-NEXT:    vmerge.vim v12, v8, 1, v0
-; NO-ZVBB-NEXT:    csrr a1, vlenb
 ; NO-ZVBB-NEXT:    addi a2, a1, -1
+; NO-ZVBB-NEXT:    slli a1, a1, 2
+; NO-ZVBB-NEXT:    vmerge.vim v8, v8, 1, v0
 ; NO-ZVBB-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; NO-ZVBB-NEXT:    vid.v v8
-; NO-ZVBB-NEXT:    vrsub.vx v16, v8, a2
+; NO-ZVBB-NEXT:    vrsub.vx v12, v12, a2
 ; NO-ZVBB-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; NO-ZVBB-NEXT:    vrgatherei16.vv v11, v12, v16
-; NO-ZVBB-NEXT:    vrgatherei16.vv v10, v13, v16
-; NO-ZVBB-NEXT:    vrgatherei16.vv v9, v14, v16
-; NO-ZVBB-NEXT:    vrgatherei16.vv v8, v15, v16
-; NO-ZVBB-NEXT:    slli a1, a1, 2
+; NO-ZVBB-NEXT:    vrgatherei16.vv v19, v8, v12
+; NO-ZVBB-NEXT:    vrgatherei16.vv v18, v9, v12
+; NO-ZVBB-NEXT:    vrgatherei16.vv v17, v10, v12
+; NO-ZVBB-NEXT:    vrgatherei16.vv v16, v11, v12
 ; NO-ZVBB-NEXT:    addi a1, a1, -64
 ; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; NO-ZVBB-NEXT:    vslidedown.vx v8, v8, a1
+; NO-ZVBB-NEXT:    vslidedown.vx v8, v16, a1
 ; NO-ZVBB-NEXT:    vmsne.vi v0, v8, 0
 ; NO-ZVBB-NEXT:    ret
 ;
@@ -157,13 +159,15 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) {
 ; CHECK-LABEL: reverse_v128i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    vrsub.vx v24, v8, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v15, v16, v24
@@ -174,7 +178,6 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v10, v21, v24
 ; CHECK-NEXT:    vrgatherei16.vv v9, v22, v24
 ; CHECK-NEXT:    vrgatherei16.vv v8, v23, v24
-; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    addi a1, a1, -128
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
@@ -253,15 +256,15 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: reverse_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
+; CHECK-NEXT:    addi a0, a0, -32
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v13, v8, v10
 ; CHECK-NEXT:    vrgatherei16.vv v12, v9, v10
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -32
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -274,20 +277,20 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) {
 ; CHECK-LABEL: reverse_v64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v15, v8, v16
-; CHECK-NEXT:    vrgatherei16.vv v14, v9, v16
-; CHECK-NEXT:    vrgatherei16.vv v13, v10, v16
-; CHECK-NEXT:    vrgatherei16.vv v12, v11, v16
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
 ; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v19, v8, v12
+; CHECK-NEXT:    vrgatherei16.vv v18, v9, v12
+; CHECK-NEXT:    vrgatherei16.vv v17, v10, v12
+; CHECK-NEXT:    vrgatherei16.vv v16, v11, v12
 ; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <64 x i8> %a, <64 x i8> poison, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <64 x i8> %res
@@ -349,10 +352,10 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) {
 ; CHECK-LABEL: reverse_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
@@ -368,20 +371,20 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) {
 ; CHECK-LABEL: reverse_v32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 1
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v16
-; CHECK-NEXT:    vrgather.vv v14, v9, v16
-; CHECK-NEXT:    vrgather.vv v13, v10, v16
-; CHECK-NEXT:    vrgather.vv v12, v11, v16
+; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    vrgather.vv v19, v8, v12
+; CHECK-NEXT:    vrgather.vv v18, v9, v12
+; CHECK-NEXT:    vrgather.vv v17, v10, v12
+; CHECK-NEXT:    vrgather.vv v16, v11, v12
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <32 x i16> %a, <32 x i16> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x i16> %res
@@ -430,14 +433,14 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) {
 ; CHECK-LABEL: reverse_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
-; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -450,10 +453,10 @@ define <16 x i32> @reverse_v16i32(<16 x i32> %a) {
 ; CHECK-LABEL: reverse_v16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    vrsub.vx v16, v12, a1
 ; CHECK-NEXT:    vrgather.vv v15, v8, v16
 ; CHECK-NEXT:    vrgather.vv v14, v9, v16
@@ -491,14 +494,14 @@ define <4 x i64> @reverse_v4i64(<4 x i64> %a) {
 ; CHECK-LABEL: reverse_v4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
-; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -4
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -511,19 +514,19 @@ define <8 x i64> @reverse_v8i64(<8 x i64> %a) {
 ; CHECK-LABEL: reverse_v8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v16
-; CHECK-NEXT:    vrgather.vv v14, v9, v16
-; CHECK-NEXT:    vrgather.vv v13, v10, v16
-; CHECK-NEXT:    vrgather.vv v12, v11, v16
+; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    vrgather.vv v19, v8, v12
+; CHECK-NEXT:    vrgather.vv v18, v9, v12
+; CHECK-NEXT:    vrgather.vv v17, v10, v12
+; CHECK-NEXT:    vrgather.vv v16, v11, v12
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i64> %res
@@ -586,10 +589,10 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) {
 ; CHECK-LABEL: reverse_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
@@ -605,20 +608,20 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) {
 ; CHECK-LABEL: reverse_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 1
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v16
-; CHECK-NEXT:    vrgather.vv v14, v9, v16
-; CHECK-NEXT:    vrgather.vv v13, v10, v16
-; CHECK-NEXT:    vrgather.vv v12, v11, v16
+; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    vrgather.vv v19, v8, v12
+; CHECK-NEXT:    vrgather.vv v18, v9, v12
+; CHECK-NEXT:    vrgather.vv v17, v10, v12
+; CHECK-NEXT:    vrgather.vv v16, v11, v12
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <32 x half> %a, <32 x half> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x half> %res
@@ -667,14 +670,14 @@ define <8 x float> @reverse_v8f32(<8 x float> %a) {
 ; CHECK-LABEL: reverse_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
-; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -687,10 +690,10 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) {
 ; CHECK-LABEL: reverse_v16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    vrsub.vx v16, v12, a1
 ; CHECK-NEXT:    vrgather.vv v15, v8, v16
 ; CHECK-NEXT:    vrgather.vv v14, v9, v16
@@ -728,14 +731,14 @@ define <4 x double> @reverse_v4f64(<4 x double> %a) {
 ; CHECK-LABEL: reverse_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
-; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -4
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -748,19 +751,19 @@ define <8 x double> @reverse_v8f64(<8 x double> %a) {
 ; CHECK-LABEL: reverse_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v16
-; CHECK-NEXT:    vrgather.vv v14, v9, v16
-; CHECK-NEXT:    vrgather.vv v13, v10, v16
-; CHECK-NEXT:    vrgather.vv v12, v11, v16
+; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    vrgather.vv v19, v8, v12
+; CHECK-NEXT:    vrgather.vv v18, v9, v12
+; CHECK-NEXT:    vrgather.vv v17, v10, v12
+; CHECK-NEXT:    vrgather.vv v16, v11, v12
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x double> %a, <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x double> %res
@@ -946,9 +949,9 @@ define <16 x i8> @reverse_v16i8_2(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vid.v v11
+; CHECK-NEXT:    li a0, 255
 ; CHECK-NEXT:    vrsub.vi v12, v11, 15
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    li a0, 255
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
@@ -965,25 +968,24 @@ define <32 x i8> @reverse_v32i8_2(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v9, v8, v12
-; CHECK-NEXT:    vrgatherei16.vv v8, v11, v12
+; CHECK-NEXT:    lui a1, 16
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v15, v8, v12
+; CHECK-NEXT:    vrgatherei16.vv v14, v9, v12
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    addi a0, a0, -32
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vi v12, v12, 15
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, mu
+; CHECK-NEXT:    vrsub.vi v12, v8, 15
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i8> %a, <16 x i8> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1035,20 +1037,22 @@ define <16 x i16> @reverse_v16i16_2(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    li a1, 255
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrsub.vi v12, v12, 7
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v8, v9
-; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrgather.vv v15, v8, v9
+; CHECK-NEXT:    vrgather.vv v14, v11, v9
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vi v12, v12, 7
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i16> %a, <8 x i16> %b,  <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1060,27 +1064,29 @@ define <32 x i16> @reverse_v32i16_2(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    lui a1, 16
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vx v14, v10, a1
-; CHECK-NEXT:    vrgather.vv v11, v8, v14
-; CHECK-NEXT:    vrgather.vv v8, v10, v14
-; CHECK-NEXT:    vrgather.vv v10, v9, v14
-; CHECK-NEXT:    vmv.v.v v9, v8
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    addi a0, a0, -32
 ; CHECK-NEXT:    vrsub.vi v16, v16, 15
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrgather.vv v23, v8, v10
+; CHECK-NEXT:    vrgather.vv v20, v11, v10
+; CHECK-NEXT:    vrgather.vv v22, v9, v10
+; CHECK-NEXT:    vmv.v.v v21, v20
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v20, a0
 ; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i16> %a, <16 x i16> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1116,22 +1122,22 @@ define <8 x i32> @reverse_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vrsub.vi v12, v12, 3
 ; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v8, v9
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrsub.vi v12, v11, 3
-; CHECK-NEXT:    vmv.v.i v0, 15
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vrgather.vv v15, v8, v9
+; CHECK-NEXT:    vrgather.vv v14, v11, v9
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1143,24 +1149,24 @@ define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v14
+; CHECK-NEXT:    li a1, 255
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrsub.vi v16, v14, 7
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v10
-; CHECK-NEXT:    vrgather.vv v16, v8, v10
-; CHECK-NEXT:    vrgather.vv v18, v9, v10
-; CHECK-NEXT:    vmv.v.v v17, v16
-; CHECK-NEXT:    addi a0, a0, -16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v14
-; CHECK-NEXT:    vrsub.vi v16, v14, 7
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT:    vrgather.vv v23, v8, v10
+; CHECK-NEXT:    vrgather.vv v20, v11, v10
+; CHECK-NEXT:    vrgather.vv v22, v9, v10
+; CHECK-NEXT:    vmv.v.v v21, v20
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v20, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i32> %a, <8 x i32> %b,  <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1170,32 +1176,34 @@ define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) {
 define <32 x i32> @reverse_v32i32_2(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: reverse_v32i32_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v0, v12
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    lui a1, 16
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vmv.s.x v28, a1
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v20, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v20
-; CHECK-NEXT:    vrgather.vv v14, v9, v20
-; CHECK-NEXT:    vrgather.vv v13, v10, v20
-; CHECK-NEXT:    vrgather.vv v8, v9, v20
-; CHECK-NEXT:    vrgather.vv v12, v11, v20
-; CHECK-NEXT:    vmv.v.v v9, v8
-; CHECK-NEXT:    vmv2r.v v10, v8
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vid.v v20
+; CHECK-NEXT:    addi a0, a0, -32
 ; CHECK-NEXT:    vrsub.vi v24, v20, 15
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgather.vv v23, v8, v12
+; CHECK-NEXT:    vrgather.vv v22, v9, v12
+; CHECK-NEXT:    vrgather.vv v21, v10, v12
+; CHECK-NEXT:    vrgather.vv v16, v13, v12
+; CHECK-NEXT:    vrgather.vv v20, v11, v12
+; CHECK-NEXT:    vmv.v.v v17, v16
+; CHECK-NEXT:    vmv2r.v v18, v16
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
+; CHECK-NEXT:    vmv4r.v v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v28
 ; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i32> %a, <16 x i32> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1222,24 +1230,24 @@ define <8 x i64> @reverse_v8i64_2(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v14
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vrsub.vi v16, v14, 3
 ; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v10
-; CHECK-NEXT:    vrgather.vv v16, v8, v10
-; CHECK-NEXT:    vrgather.vv v18, v9, v10
-; CHECK-NEXT:    vmv.v.v v17, v16
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v14
-; CHECK-NEXT:    vrsub.vi v16, v14, 3
-; CHECK-NEXT:    vmv.v.i v0, 15
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; CHECK-NEXT:    vrgather.vv v23, v8, v10
+; CHECK-NEXT:    vrgather.vv v20, v11, v10
+; CHECK-NEXT:    vrgather.vv v22, v9, v10
+; CHECK-NEXT:    vmv.v.v v21, v20
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v20, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1291,20 +1299,22 @@ define <16 x half> @reverse_v16f16_2(<8 x half> %a, <8 x half> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    li a1, 255
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrsub.vi v12, v12, 7
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v8, v9
-; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrgather.vv v15, v8, v9
+; CHECK-NEXT:    vrgather.vv v14, v11, v9
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vi v12, v12, 7
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x half> %a, <8 x half> %b,  <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1315,20 +1325,20 @@ define <32 x half> @reverse_v32f16_2(<16 x half> %a) {
 ; CHECK-LABEL: reverse_v32f16_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vx v12, v10, a1
-; CHECK-NEXT:    vrgather.vv v11, v8, v12
-; CHECK-NEXT:    vrgather.vv v8, v10, v12
-; CHECK-NEXT:    vrgather.vv v10, v9, v12
-; CHECK-NEXT:    vmv.v.v v9, v8
+; CHECK-NEXT:    vrsub.vx v10, v10, a1
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -32
-; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vrgather.vv v15, v8, v10
+; CHECK-NEXT:    vrgather.vv v12, v11, v10
+; CHECK-NEXT:    vrgather.vv v14, v9, v10
+; CHECK-NEXT:    vmv.v.v v13, v12
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslidedown.vx v8, v12, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x half> %a, <16 x half> poison,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x half> %res
@@ -1363,22 +1373,22 @@ define <8 x float> @reverse_v8f32_2(<4 x float> %a, <4 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vrsub.vi v12, v12, 3
 ; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v8, v9
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrsub.vi v12, v11, 3
-; CHECK-NEXT:    vmv.v.i v0, 15
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vrgather.vv v15, v8, v9
+; CHECK-NEXT:    vrgather.vv v14, v11, v9
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1390,24 +1400,24 @@ define <16 x float> @reverse_v16f32_2(<8 x float> %a, <8 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v14
+; CHECK-NEXT:    li a1, 255
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrsub.vi v16, v14, 7
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v10
-; CHECK-NEXT:    vrgather.vv v16, v8, v10
-; CHECK-NEXT:    vrgather.vv v18, v9, v10
-; CHECK-NEXT:    vmv.v.v v17, v16
-; CHECK-NEXT:    addi a0, a0, -16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v14
-; CHECK-NEXT:    vrsub.vi v16, v14, 7
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT:    vrgather.vv v23, v8, v10
+; CHECK-NEXT:    vrgather.vv v20, v11, v10
+; CHECK-NEXT:    vrgather.vv v22, v9, v10
+; CHECK-NEXT:    vmv.v.v v21, v20
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v20, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1434,24 +1444,24 @@ define <8 x double> @reverse_v8f64_2(<4 x double> %a, <4 x double> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v14
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vrsub.vi v16, v14, 3
 ; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v10
-; CHECK-NEXT:    vrgather.vv v16, v8, v10
-; CHECK-NEXT:    vrgather.vv v18, v9, v10
-; CHECK-NEXT:    vmv.v.v v17, v16
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v14
-; CHECK-NEXT:    vrsub.vi v16, v14, 3
-; CHECK-NEXT:    vmv.v.i v0, 15
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; CHECK-NEXT:    vrgather.vv v23, v8, v10
+; CHECK-NEXT:    vrgather.vv v20, v11, v10
+; CHECK-NEXT:    vrgather.vv v22, v9, v10
+; CHECK-NEXT:    vmv.v.v v21, v20
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v20, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1464,18 +1474,19 @@ define <256 x i1> @reverse_v256i1(<256 x i1> %a) vscale_range(16, 1024) {
 ; CHECK-LABEL: reverse_v256i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vx v10, v10, a2
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v13, v8, v10
-; CHECK-NEXT:    vrgatherei16.vv v12, v9, v10
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vrgatherei16.vv v13, v10, v8
+; CHECK-NEXT:    vrgatherei16.vv v12, v11, v8
 ; CHECK-NEXT:    addi a1, a1, -256
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
index 3cfcb4398a1f00..02355d331e13f6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
@@ -482,15 +482,15 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 48
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8i16_as_i64_16:
@@ -528,15 +528,15 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8i16_as_i64_32:
@@ -574,15 +574,15 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 16
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8i16_as_i64_48:
@@ -620,15 +620,15 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v10, v12, a1
-; RV32-NEXT:    vsrl.vv v10, v8, v10
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vand.vx v12, v12, a1
-; RV32-NEXT:    vsll.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vsrl.vv v12, v8, v12
+; RV32-NEXT:    vsll.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8i32_as_i64:
@@ -691,15 +691,15 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 48
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f16_as_i64_16:
@@ -737,15 +737,15 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f16_as_i64_32:
@@ -783,15 +783,15 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 16
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f16_as_i64_48:
@@ -829,15 +829,15 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v10, v12, a1
-; RV32-NEXT:    vsrl.vv v10, v8, v10
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vand.vx v12, v12, a1
-; RV32-NEXT:    vsll.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vsrl.vv v12, v8, v12
+; RV32-NEXT:    vsll.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f32_as_i64:
@@ -876,15 +876,15 @@ define <8 x float> @shuffle_v8f32_as_i64_exact(<8 x float> %v) vscale_range(2,2)
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v10, v12, a1
-; RV32-NEXT:    vsrl.vv v10, v8, v10
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vand.vx v12, v12, a1
-; RV32-NEXT:    vsll.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vsrl.vv v12, v8, v12
+; RV32-NEXT:    vsll.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f32_as_i64_exact:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 15c2c2298c0dd6..8f6240e112cdd7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -386,22 +386,23 @@ define void @vnsrl_0_i8_undef3(ptr %in, ptr %out) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 16, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    li a0, -32
+; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    lui a0, 24640
-; CHECK-NEXT:    addi a0, a0, 6
-; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
 ; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    addi a0, a0, 6
 ; CHECK-NEXT:    vadd.vv v9, v9, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vadd.vi v9, v9, -8
-; CHECK-NEXT:    li a0, -32
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vrgather.vv v11, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vse8.v v10, (a1)
+; CHECK-NEXT:    vrgather.vv v11, v8, v9, v0.t
+; CHECK-NEXT:    vse8.v v11, (a1)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i8>, ptr %in, align 1
@@ -419,18 +420,18 @@ define void @vnsrl_0_i8_undef_negative(ptr %in, ptr %out) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI17_0)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
 ; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vadd.vi v9, v9, -8
+; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    li a0, 48
+; CHECK-NEXT:    vadd.vv v9, v9, v9
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vadd.vi v9, v9, -8
+; CHECK-NEXT:    vrgather.vv v11, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vse8.v v10, (a1)
+; CHECK-NEXT:    vrgather.vv v11, v8, v9, v0.t
+; CHECK-NEXT:    vse8.v v11, (a1)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i8>, ptr %in, align 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
index 5232d0d69fad08..74f2cec04f0de9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
@@ -88,17 +88,17 @@ define void @store_v6i1(ptr %p, <6 x i1> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vfirst.m a1, v0
-; CHECK-NEXT:    seqz a1, a1
 ; CHECK-NEXT:    vmv.x.s a2, v0
+; CHECK-NEXT:    seqz a1, a1
 ; CHECK-NEXT:    andi a3, a2, 2
+; CHECK-NEXT:    andi a4, a2, 4
 ; CHECK-NEXT:    or a1, a1, a3
-; CHECK-NEXT:    andi a3, a2, 4
-; CHECK-NEXT:    andi a4, a2, 8
-; CHECK-NEXT:    or a3, a3, a4
-; CHECK-NEXT:    or a1, a1, a3
-; CHECK-NEXT:    andi a3, a2, 16
+; CHECK-NEXT:    andi a3, a2, 8
+; CHECK-NEXT:    or a3, a4, a3
+; CHECK-NEXT:    andi a4, a2, 16
 ; CHECK-NEXT:    andi a2, a2, -32
-; CHECK-NEXT:    or a2, a3, a2
+; CHECK-NEXT:    or a1, a1, a3
+; CHECK-NEXT:    or a2, a4, a2
 ; CHECK-NEXT:    or a1, a1, a2
 ; CHECK-NEXT:    andi a1, a1, 63
 ; CHECK-NEXT:    sb a1, 0(a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
index ddde1e94abbde9..a526974f77730b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -255,14 +255,14 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a0)
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a4, a0, a4
-; RV32-NEXT:    vle16.v v10, (a4)
-; RV32-NEXT:    add a2, a4, a2
-; RV32-NEXT:    vle16.v v9, (a2)
+; RV32-NEXT:    vle16.v v9, (a0)
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    vle16.v v10, (a0)
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    vle16.v v11, (a0)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vslideup.vi v10, v9, 4
-; RV32-NEXT:    vslideup.vi v8, v11, 4
+; RV32-NEXT:    vslideup.vi v8, v9, 4
+; RV32-NEXT:    vslideup.vi v10, v11, 4
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vslideup.vi v8, v10, 8
 ; RV32-NEXT:    vse16.v v8, (a1)
@@ -273,14 +273,14 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV64-NEXT:    vle16.v v8, (a0)
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a3, a0, a3
-; RV64-NEXT:    vle16.v v10, (a3)
-; RV64-NEXT:    add a2, a3, a2
-; RV64-NEXT:    vle16.v v9, (a2)
+; RV64-NEXT:    vle16.v v9, (a0)
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    vle16.v v10, (a0)
+; RV64-NEXT:    add a0, a0, a2
 ; RV64-NEXT:    vle16.v v11, (a0)
 ; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV64-NEXT:    vslideup.vi v10, v9, 4
-; RV64-NEXT:    vslideup.vi v8, v11, 4
+; RV64-NEXT:    vslideup.vi v8, v9, 4
+; RV64-NEXT:    vslideup.vi v10, v11, 4
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vslideup.vi v8, v10, 8
 ; RV64-NEXT:    vse16.v v8, (a1)
@@ -291,14 +291,14 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVE64F-NEXT:    vle16.v v8, (a0)
 ; ZVE64F-NEXT:    add a0, a0, a2
-; ZVE64F-NEXT:    add a3, a0, a3
-; ZVE64F-NEXT:    vle16.v v10, (a3)
-; ZVE64F-NEXT:    add a2, a3, a2
-; ZVE64F-NEXT:    vle16.v v9, (a2)
+; ZVE64F-NEXT:    vle16.v v9, (a0)
+; ZVE64F-NEXT:    add a0, a0, a3
+; ZVE64F-NEXT:    vle16.v v10, (a0)
+; ZVE64F-NEXT:    add a0, a0, a2
 ; ZVE64F-NEXT:    vle16.v v11, (a0)
 ; ZVE64F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVE64F-NEXT:    vslideup.vi v10, v9, 4
-; ZVE64F-NEXT:    vslideup.vi v8, v11, 4
+; ZVE64F-NEXT:    vslideup.vi v8, v9, 4
+; ZVE64F-NEXT:    vslideup.vi v10, v11, 4
 ; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVE64F-NEXT:    vslideup.vi v8, v10, 8
 ; ZVE64F-NEXT:    vse16.v v8, (a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 28202dc07f9564..d506842b5eff67 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -54,11 +54,11 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-LABEL: gather_masked:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi a2, a0, 1024
-; CHECK-NEXT:    lui a3, 983765
-; CHECK-NEXT:    addi a3, a3, 873
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a3
+; CHECK-NEXT:    lui a4, 983765
 ; CHECK-NEXT:    li a3, 32
+; CHECK-NEXT:    addi a4, a4, 873
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a4
 ; CHECK-NEXT:    li a4, 5
 ; CHECK-NEXT:  .LBB1_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -303,9 +303,9 @@ define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vlse8.v v9, (a0), a3
+; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsse8.v v8, (a0), a3
-; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    addi a0, a0, 160
 ; CHECK-NEXT:    bne a1, a2, .LBB6_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -348,9 +348,9 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read
 ; CHECK-NEXT:    vle8.v v9, (a1)
 ; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    vlse8.v v10, (a0), a4, v0.t
+; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    vadd.vv v9, v10, v9
 ; CHECK-NEXT:    vsse8.v v9, (a0), a4, v0.t
-; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    addi a0, a0, 160
 ; CHECK-NEXT:    bne a1, a2, .LBB7_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -384,14 +384,14 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
 ; CHECK-LABEL: gather_pow2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a2, 1
-; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    li a3, 16
+; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:  .LBB8_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
-; CHECK-NEXT:    vlse32.v v8, (a1), a3
+; CHECK-NEXT:    vlse32.v v8, (a1), a2
 ; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
@@ -400,7 +400,7 @@ define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonl
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 32
 ; CHECK-NEXT:    addi a1, a1, 128
-; CHECK-NEXT:    bne a0, a2, .LBB8_1
+; CHECK-NEXT:    bne a0, a3, .LBB8_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
@@ -432,21 +432,21 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
 ; CHECK-LABEL: scatter_pow2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a2, 1
-; CHECK-NEXT:    add a2, a1, a2
-; CHECK-NEXT:    li a3, 32
+; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    li a2, 32
+; CHECK-NEXT:    add a3, a1, a3
 ; CHECK-NEXT:    li a4, 16
 ; CHECK-NEXT:  .LBB9_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
 ; CHECK-NEXT:    vlse32.v v9, (a0), a4
+; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsse32.v v8, (a0), a4
-; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    bne a1, a2, .LBB9_1
+; CHECK-NEXT:    bne a1, a3, .LBB9_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
@@ -491,14 +491,14 @@ define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    addi a4, a0, 32
 ; CHECK-NEXT:    addi a5, a1, -128
-; CHECK-NEXT:    vlse32.v v8, (a5), a3
-; CHECK-NEXT:    vlse32.v v9, (a1), a3
-; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a1), a3
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    vlse32.v v10, (a5), a3
 ; CHECK-NEXT:    vle32.v v11, (a4)
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vadd.vv v9, v11, v9
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    vse32.v v9, (a4)
+; CHECK-NEXT:    vadd.vv v9, v9, v10
+; CHECK-NEXT:    vadd.vv v8, v11, v8
+; CHECK-NEXT:    vse32.v v9, (a0)
+; CHECK-NEXT:    vse32.v v8, (a4)
 ; CHECK-NEXT:    addi a0, a0, 64
 ; CHECK-NEXT:    addi a1, a1, 256
 ; CHECK-NEXT:    bne a0, a2, .LBB10_1
@@ -551,9 +551,9 @@ define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vlse32.v v8, (a1), a3
 ; CHECK-NEXT:    vlse32.v v9, (a0), a4
+; CHECK-NEXT:    addi a5, a1, 16
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsse32.v v8, (a0), a4
-; CHECK-NEXT:    addi a5, a1, 16
 ; CHECK-NEXT:    vlse32.v v8, (a5), a3
 ; CHECK-NEXT:    addi a5, a0, 4
 ; CHECK-NEXT:    vlse32.v v9, (a5), a4
@@ -569,10 +569,10 @@ define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT:    vlse32.v v8, (a5), a3
 ; CHECK-NEXT:    addi a5, a0, 12
 ; CHECK-NEXT:    vlse32.v v9, (a5), a4
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsse32.v v8, (a5), a4
 ; CHECK-NEXT:    addi a2, a2, -8
 ; CHECK-NEXT:    addi a1, a1, 512
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsse32.v v8, (a5), a4
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    bnez a2, .LBB11_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -638,13 +638,13 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; V-NEXT:  .LBB12_1: # %bb2
 ; V-NEXT:    # =>This Inner Loop Header: Depth=1
-; V-NEXT:    addi a4, a1, 80
 ; V-NEXT:    vlse64.v v8, (a1), a3
+; V-NEXT:    addi a4, a1, 80
 ; V-NEXT:    vlse64.v v9, (a4), a3
 ; V-NEXT:    addi a4, a0, 16
 ; V-NEXT:    vse64.v v8, (a0)
-; V-NEXT:    vse64.v v9, (a4)
 ; V-NEXT:    addi a0, a0, 32
+; V-NEXT:    vse64.v v9, (a4)
 ; V-NEXT:    addi a1, a1, 160
 ; V-NEXT:    bne a0, a2, .LBB12_1
 ; V-NEXT:  # %bb.2: # %bb18
@@ -653,15 +653,16 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; ZVE32F-LABEL: gather_of_pointers:
 ; ZVE32F:       # %bb.0: # %bb
 ; ZVE32F-NEXT:    li a2, 0
-; ZVE32F-NEXT:    lui a3, 2
-; ZVE32F-NEXT:    add a3, a0, a3
-; ZVE32F-NEXT:    li a4, 1
+; ZVE32F-NEXT:    lui a4, 2
+; ZVE32F-NEXT:    li a3, 1
+; ZVE32F-NEXT:    add a4, a0, a4
 ; ZVE32F-NEXT:    li a5, 40
 ; ZVE32F-NEXT:  .LBB12_1: # %bb2
 ; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
-; ZVE32F-NEXT:    mul a6, a4, a5
-; ZVE32F-NEXT:    add a6, a1, a6
+; ZVE32F-NEXT:    mul a6, a3, a5
 ; ZVE32F-NEXT:    mul a7, a2, a5
+; ZVE32F-NEXT:    addi a2, a2, 4
+; ZVE32F-NEXT:    add a6, a1, a6
 ; ZVE32F-NEXT:    add a7, a1, a7
 ; ZVE32F-NEXT:    ld t0, 0(a7)
 ; ZVE32F-NEXT:    ld t1, 0(a6)
@@ -671,10 +672,9 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; ZVE32F-NEXT:    sd t1, 8(a0)
 ; ZVE32F-NEXT:    sd a7, 16(a0)
 ; ZVE32F-NEXT:    sd a6, 24(a0)
-; ZVE32F-NEXT:    addi a2, a2, 4
 ; ZVE32F-NEXT:    addi a0, a0, 32
-; ZVE32F-NEXT:    addi a4, a4, 4
-; ZVE32F-NEXT:    bne a0, a3, .LBB12_1
+; ZVE32F-NEXT:    addi a3, a3, 4
+; ZVE32F-NEXT:    bne a0, a4, .LBB12_1
 ; ZVE32F-NEXT:  # %bb.2: # %bb18
 ; ZVE32F-NEXT:    ret
 bb:
@@ -719,9 +719,9 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; V-NEXT:    vle64.v v8, (a1)
 ; V-NEXT:    vle64.v v9, (a4)
 ; V-NEXT:    addi a4, a0, 80
+; V-NEXT:    addi a1, a1, 32
 ; V-NEXT:    vsse64.v v8, (a0), a3
 ; V-NEXT:    vsse64.v v9, (a4), a3
-; V-NEXT:    addi a1, a1, 32
 ; V-NEXT:    addi a0, a0, 160
 ; V-NEXT:    bne a1, a2, .LBB13_1
 ; V-NEXT:  # %bb.2: # %bb18
@@ -730,9 +730,9 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-LABEL: scatter_of_pointers:
 ; ZVE32F:       # %bb.0: # %bb
 ; ZVE32F-NEXT:    li a2, 0
-; ZVE32F-NEXT:    lui a3, 2
-; ZVE32F-NEXT:    add a3, a1, a3
-; ZVE32F-NEXT:    li a4, 1
+; ZVE32F-NEXT:    lui a4, 2
+; ZVE32F-NEXT:    li a3, 1
+; ZVE32F-NEXT:    add a4, a1, a4
 ; ZVE32F-NEXT:    li a5, 40
 ; ZVE32F-NEXT:  .LBB13_1: # %bb2
 ; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -740,18 +740,18 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-NEXT:    ld a7, 8(a1)
 ; ZVE32F-NEXT:    ld t0, 16(a1)
 ; ZVE32F-NEXT:    ld t1, 24(a1)
-; ZVE32F-NEXT:    mul t2, a4, a5
-; ZVE32F-NEXT:    add t2, a0, t2
+; ZVE32F-NEXT:    mul t2, a3, a5
 ; ZVE32F-NEXT:    mul t3, a2, a5
+; ZVE32F-NEXT:    addi a2, a2, 4
+; ZVE32F-NEXT:    addi a1, a1, 32
+; ZVE32F-NEXT:    add t2, a0, t2
 ; ZVE32F-NEXT:    add t3, a0, t3
 ; ZVE32F-NEXT:    sd a6, 0(t3)
 ; ZVE32F-NEXT:    sd a7, 0(t2)
 ; ZVE32F-NEXT:    sd t0, 80(t3)
 ; ZVE32F-NEXT:    sd t1, 80(t2)
-; ZVE32F-NEXT:    addi a2, a2, 4
-; ZVE32F-NEXT:    addi a1, a1, 32
-; ZVE32F-NEXT:    addi a4, a4, 4
-; ZVE32F-NEXT:    bne a1, a3, .LBB13_1
+; ZVE32F-NEXT:    addi a3, a3, 4
+; ZVE32F-NEXT:    bne a1, a4, .LBB13_1
 ; ZVE32F-NEXT:  # %bb.2: # %bb18
 ; ZVE32F-NEXT:    ret
 bb:
@@ -794,36 +794,36 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt
 ; CHECK-NEXT:    mv a4, a2
 ; CHECK-NEXT:    bltu a5, a6, .LBB14_5
 ; CHECK-NEXT:  # %bb.2: # %bb9
-; CHECK-NEXT:    slli a5, a5, 32
-; CHECK-NEXT:    srli a5, a5, 32
-; CHECK-NEXT:    addi a5, a5, 1
-; CHECK-NEXT:    andi a6, a5, -32
-; CHECK-NEXT:    add a4, a6, a2
+; CHECK-NEXT:    slli a4, a5, 32
 ; CHECK-NEXT:    slli t0, a2, 2
-; CHECK-NEXT:    add a7, a0, a2
-; CHECK-NEXT:    add a2, a1, a2
-; CHECK-NEXT:    add a2, a2, t0
-; CHECK-NEXT:    add t0, a4, a0
+; CHECK-NEXT:    add a5, a0, a2
+; CHECK-NEXT:    add a6, a1, a2
 ; CHECK-NEXT:    li t2, 32
+; CHECK-NEXT:    srli a4, a4, 32
+; CHECK-NEXT:    add t0, a6, t0
+; CHECK-NEXT:    addi a6, a4, 1
+; CHECK-NEXT:    andi a7, a6, -32
+; CHECK-NEXT:    add a4, a7, a2
+; CHECK-NEXT:    add a2, a4, a0
 ; CHECK-NEXT:    li t1, 5
 ; CHECK-NEXT:    vsetvli zero, t2, e8, m1, ta, ma
 ; CHECK-NEXT:  .LBB14_3: # %bb15
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vlse8.v v8, (a2), t1
-; CHECK-NEXT:    vle8.v v9, (a7)
+; CHECK-NEXT:    vlse8.v v8, (t0), t1
+; CHECK-NEXT:    vle8.v v9, (a5)
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vse8.v v8, (a7)
-; CHECK-NEXT:    addi a7, a7, 32
-; CHECK-NEXT:    addi a2, a2, 160
-; CHECK-NEXT:    bne a7, t0, .LBB14_3
+; CHECK-NEXT:    vse8.v v8, (a5)
+; CHECK-NEXT:    addi a5, a5, 32
+; CHECK-NEXT:    addi t0, t0, 160
+; CHECK-NEXT:    bne a5, a2, .LBB14_3
 ; CHECK-NEXT:  # %bb.4: # %bb30
-; CHECK-NEXT:    beq a5, a6, .LBB14_7
+; CHECK-NEXT:    beq a6, a7, .LBB14_7
 ; CHECK-NEXT:  .LBB14_5: # %bb32
 ; CHECK-NEXT:    add a2, a0, a4
 ; CHECK-NEXT:    slli a5, a4, 2
 ; CHECK-NEXT:    add a1, a1, a4
-; CHECK-NEXT:    add a1, a1, a5
 ; CHECK-NEXT:    subw a3, a3, a4
+; CHECK-NEXT:    add a1, a1, a5
 ; CHECK-NEXT:    slli a3, a3, 32
 ; CHECK-NEXT:    srli a3, a3, 32
 ; CHECK-NEXT:    add a0, a4, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 98c0020e500046..1c2c90478a1f77 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -550,12 +550,12 @@ define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x
 ; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    mul a4, a3, a1
-; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    addi a5, a2, -16
-; CHECK-NEXT:    sltu a2, a2, a5
-; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 2
+; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    sltu a2, a2, a5
+; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -578,8 +578,8 @@ define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %
 ; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:  .LBB46_2:
 ; CHECK-NEXT:    mul a4, a3, a1
-; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    addi a5, a2, -16
+; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    sltu a2, a2, a5
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5
@@ -637,13 +637,13 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v8, (a1), a2, v0.t
+; CHECK-RV32-NEXT:    addi a1, a0, 128
+; CHECK-RV32-NEXT:    addi a2, a0, 256
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vse64.v v8, (a0)
-; CHECK-RV32-NEXT:    addi a1, a0, 128
 ; CHECK-RV32-NEXT:    vse64.v v24, (a1)
-; CHECK-RV32-NEXT:    addi a0, a0, 256
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vse64.v v16, (a0)
+; CHECK-RV32-NEXT:    vse64.v v16, (a2)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: strided_load_v33f64:
@@ -687,13 +687,13 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v8, (a1), a2, v0.t
+; CHECK-RV64-NEXT:    addi a1, a0, 128
+; CHECK-RV64-NEXT:    addi a2, a0, 256
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vse64.v v8, (a0)
-; CHECK-RV64-NEXT:    addi a1, a0, 128
 ; CHECK-RV64-NEXT:    vse64.v v24, (a1)
-; CHECK-RV64-NEXT:    addi a0, a0, 256
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vse64.v v16, (a0)
+; CHECK-RV64-NEXT:    vse64.v v16, (a2)
 ; CHECK-RV64-NEXT:    ret
   %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl)
   ret <33 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index 0e950c054c924e..6648fa7fbe30fb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -68,11 +68,11 @@ define <128 x i7> @vtrunc_v128i7_v128i16(<128 x i16> %a, <128 x i1> %m, i32 zero
 ; CHECK-NEXT:    sltu a0, a0, a2
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v24, a1
 ; CHECK-NEXT:    ret
   %v = call <128 x i7> @llvm.vp.trunc.v128i7.v128i16(<128 x i16> %a, <128 x i1> %m, i32 %vl)
@@ -246,17 +246,17 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v25, v0, 8
 ; CHECK-NEXT:    addi a2, a1, 512
+; CHECK-NEXT:    addi a3, a1, 640
+; CHECK-NEXT:    addi a4, a7, -64
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v27, v25, 4
-; CHECK-NEXT:    addi a3, a1, 640
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a3)
+; CHECK-NEXT:    sltu a3, a7, a4
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v27, 2
-; CHECK-NEXT:    addi a3, a7, -64
-; CHECK-NEXT:    sltu a4, a7, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a4, a4, a3
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a4, a3, a4
 ; CHECK-NEXT:    addi a3, a4, -32
 ; CHECK-NEXT:    sltu a5, a4, a3
 ; CHECK-NEXT:    addi a5, a5, -1
@@ -545,11 +545,11 @@ define <32 x i32> @vtrunc_v32i32_v32i64(<32 x i64> %a, <32 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v24, 16
 ; CHECK-NEXT:    ret
   %v = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> %a, <32 x i1> %m, i32 %vl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index 293b75dc207c86..db03dc3d5ab1e2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -313,10 +313,12 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m)
 ; RV32-SLOW-NEXT:  .LBB6_8: # %cond.store5
 ; RV32-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32-SLOW-NEXT:    vslidedown.vi v9, v9, 3
+; RV32-SLOW-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vmv.x.s a0, v8
 ; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-SLOW-NEXT:    vslidedown.vi v8, v9, 3
-; RV32-SLOW-NEXT:    vmv.x.s a1, v8
+; RV32-SLOW-NEXT:    vmv.x.s a1, v9
 ; RV32-SLOW-NEXT:    srli a2, a0, 8
 ; RV32-SLOW-NEXT:    sb a0, 0(a1)
 ; RV32-SLOW-NEXT:    sb a2, 1(a1)
@@ -376,10 +378,12 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m)
 ; RV64-SLOW-NEXT:  .LBB6_8: # %cond.store5
 ; RV64-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64-SLOW-NEXT:    vslidedown.vi v10, v10, 3
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vmv.x.s a0, v8
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-SLOW-NEXT:    vslidedown.vi v8, v10, 3
-; RV64-SLOW-NEXT:    vmv.x.s a1, v8
+; RV64-SLOW-NEXT:    vmv.x.s a1, v10
 ; RV64-SLOW-NEXT:    srli a2, a0, 8
 ; RV64-SLOW-NEXT:    sb a0, 0(a1)
 ; RV64-SLOW-NEXT:    sb a2, 1(a1)
@@ -426,9 +430,9 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m)
 ; RV32-SLOW-NEXT:  .LBB7_4: # %cond.store1
 ; RV32-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vslidedown.vi v8, v8, 1
+; RV32-SLOW-NEXT:    vslidedown.vi v9, v9, 1
 ; RV32-SLOW-NEXT:    vmv.x.s a0, v8
-; RV32-SLOW-NEXT:    vslidedown.vi v8, v9, 1
-; RV32-SLOW-NEXT:    vmv.x.s a1, v8
+; RV32-SLOW-NEXT:    vmv.x.s a1, v9
 ; RV32-SLOW-NEXT:    srli a2, a0, 16
 ; RV32-SLOW-NEXT:    sh a0, 0(a1)
 ; RV32-SLOW-NEXT:    sh a2, 2(a1)
@@ -458,10 +462,12 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m)
 ; RV64-SLOW-NEXT:  .LBB7_4: # %cond.store1
 ; RV64-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 1
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64-SLOW-NEXT:    vslidedown.vi v9, v9, 1
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vmv.x.s a0, v8
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64-SLOW-NEXT:    vslidedown.vi v8, v9, 1
-; RV64-SLOW-NEXT:    vmv.x.s a1, v8
+; RV64-SLOW-NEXT:    vmv.x.s a1, v9
 ; RV64-SLOW-NEXT:    srli a2, a0, 16
 ; RV64-SLOW-NEXT:    sh a0, 0(a1)
 ; RV64-SLOW-NEXT:    sh a2, 2(a1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
index 4e9862b05f4089..dfd509062ccf7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
@@ -57,8 +57,8 @@ define <1 x i1> @fcmp_ogt_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -73,8 +73,8 @@ define <1 x i1> @fcmp_ogt_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -102,8 +102,8 @@ define <1 x i1> @fcmp_oge_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -118,8 +118,8 @@ define <1 x i1> @fcmp_oge_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -147,8 +147,8 @@ define <1 x i1> @fcmp_olt_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -163,8 +163,8 @@ define <1 x i1> @fcmp_olt_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -192,8 +192,8 @@ define <1 x i1> @fcmp_ole_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -208,8 +208,8 @@ define <1 x i1> @fcmp_ole_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -240,8 +240,8 @@ define <1 x i1> @fcmp_one_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -259,8 +259,8 @@ define <1 x i1> @fcmp_one_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -336,8 +336,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -355,8 +355,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -388,8 +388,8 @@ define <1 x i1> @fcmp_ugt_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -405,8 +405,8 @@ define <1 x i1> @fcmp_ugt_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -436,8 +436,8 @@ define <1 x i1> @fcmp_uge_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -453,8 +453,8 @@ define <1 x i1> @fcmp_uge_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -484,8 +484,8 @@ define <1 x i1> @fcmp_ult_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -501,8 +501,8 @@ define <1 x i1> @fcmp_ult_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -532,8 +532,8 @@ define <1 x i1> @fcmp_ule_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -549,8 +549,8 @@ define <1 x i1> @fcmp_ule_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -690,8 +690,8 @@ define <2 x i1> @fcmp_ogt_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -706,8 +706,8 @@ define <2 x i1> @fcmp_ogt_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -735,8 +735,8 @@ define <2 x i1> @fcmp_oge_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -751,8 +751,8 @@ define <2 x i1> @fcmp_oge_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -780,8 +780,8 @@ define <2 x i1> @fcmp_olt_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -796,8 +796,8 @@ define <2 x i1> @fcmp_olt_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -825,8 +825,8 @@ define <2 x i1> @fcmp_ole_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -841,8 +841,8 @@ define <2 x i1> @fcmp_ole_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -873,8 +873,8 @@ define <2 x i1> @fcmp_one_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -892,8 +892,8 @@ define <2 x i1> @fcmp_one_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -969,8 +969,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -988,8 +988,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -1021,8 +1021,8 @@ define <2 x i1> @fcmp_ugt_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1038,8 +1038,8 @@ define <2 x i1> @fcmp_ugt_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1069,8 +1069,8 @@ define <2 x i1> @fcmp_uge_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1086,8 +1086,8 @@ define <2 x i1> @fcmp_uge_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1117,8 +1117,8 @@ define <2 x i1> @fcmp_ult_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1134,8 +1134,8 @@ define <2 x i1> @fcmp_ult_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1165,8 +1165,8 @@ define <2 x i1> @fcmp_ule_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1182,8 +1182,8 @@ define <2 x i1> @fcmp_ule_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1323,8 +1323,8 @@ define <4 x i1> @fcmp_ogt_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1339,8 +1339,8 @@ define <4 x i1> @fcmp_ogt_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1368,8 +1368,8 @@ define <4 x i1> @fcmp_oge_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1384,8 +1384,8 @@ define <4 x i1> @fcmp_oge_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1413,8 +1413,8 @@ define <4 x i1> @fcmp_olt_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1429,8 +1429,8 @@ define <4 x i1> @fcmp_olt_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1458,8 +1458,8 @@ define <4 x i1> @fcmp_ole_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1474,8 +1474,8 @@ define <4 x i1> @fcmp_ole_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1506,8 +1506,8 @@ define <4 x i1> @fcmp_one_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -1525,8 +1525,8 @@ define <4 x i1> @fcmp_one_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -1602,8 +1602,8 @@ define <4 x i1> @fcmp_ueq_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -1621,8 +1621,8 @@ define <4 x i1> @fcmp_ueq_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -1654,8 +1654,8 @@ define <4 x i1> @fcmp_ugt_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1671,8 +1671,8 @@ define <4 x i1> @fcmp_ugt_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1702,8 +1702,8 @@ define <4 x i1> @fcmp_uge_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1719,8 +1719,8 @@ define <4 x i1> @fcmp_uge_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1750,8 +1750,8 @@ define <4 x i1> @fcmp_ult_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1767,8 +1767,8 @@ define <4 x i1> @fcmp_ult_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1798,8 +1798,8 @@ define <4 x i1> @fcmp_ule_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1815,8 +1815,8 @@ define <4 x i1> @fcmp_ule_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1956,8 +1956,8 @@ define <8 x i1> @fcmp_ogt_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1972,8 +1972,8 @@ define <8 x i1> @fcmp_ogt_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2001,8 +2001,8 @@ define <8 x i1> @fcmp_oge_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2017,8 +2017,8 @@ define <8 x i1> @fcmp_oge_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2046,8 +2046,8 @@ define <8 x i1> @fcmp_olt_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2062,8 +2062,8 @@ define <8 x i1> @fcmp_olt_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2091,8 +2091,8 @@ define <8 x i1> @fcmp_ole_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2107,8 +2107,8 @@ define <8 x i1> @fcmp_ole_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2139,8 +2139,8 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -2158,8 +2158,8 @@ define <8 x i1> @fcmp_one_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -2235,8 +2235,8 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -2254,8 +2254,8 @@ define <8 x i1> @fcmp_ueq_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -2287,8 +2287,8 @@ define <8 x i1> @fcmp_ugt_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2304,8 +2304,8 @@ define <8 x i1> @fcmp_ugt_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2335,8 +2335,8 @@ define <8 x i1> @fcmp_uge_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2352,8 +2352,8 @@ define <8 x i1> @fcmp_uge_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2383,8 +2383,8 @@ define <8 x i1> @fcmp_ult_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2400,8 +2400,8 @@ define <8 x i1> @fcmp_ult_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2431,8 +2431,8 @@ define <8 x i1> @fcmp_ule_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2448,8 +2448,8 @@ define <8 x i1> @fcmp_ule_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2797,9 +2797,9 @@ define <16 x i1> @fcmp_one_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -2817,9 +2817,9 @@ define <16 x i1> @fcmp_one_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -2896,9 +2896,9 @@ define <16 x i1> @fcmp_ueq_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -2916,9 +2916,9 @@ define <16 x i1> @fcmp_ueq_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -3981,8 +3981,8 @@ define <1 x i1> @fcmp_ogt_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -3997,8 +3997,8 @@ define <1 x i1> @fcmp_ogt_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4026,8 +4026,8 @@ define <1 x i1> @fcmp_oge_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4042,8 +4042,8 @@ define <1 x i1> @fcmp_oge_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4071,8 +4071,8 @@ define <1 x i1> @fcmp_olt_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4087,8 +4087,8 @@ define <1 x i1> @fcmp_olt_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4116,8 +4116,8 @@ define <1 x i1> @fcmp_ole_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4132,8 +4132,8 @@ define <1 x i1> @fcmp_ole_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4164,8 +4164,8 @@ define <1 x i1> @fcmp_one_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4183,8 +4183,8 @@ define <1 x i1> @fcmp_one_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4260,8 +4260,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4279,8 +4279,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4312,8 +4312,8 @@ define <1 x i1> @fcmp_ugt_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4329,8 +4329,8 @@ define <1 x i1> @fcmp_ugt_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4360,8 +4360,8 @@ define <1 x i1> @fcmp_uge_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4377,8 +4377,8 @@ define <1 x i1> @fcmp_uge_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4408,8 +4408,8 @@ define <1 x i1> @fcmp_ult_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4425,8 +4425,8 @@ define <1 x i1> @fcmp_ult_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4456,8 +4456,8 @@ define <1 x i1> @fcmp_ule_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4473,8 +4473,8 @@ define <1 x i1> @fcmp_ule_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4614,8 +4614,8 @@ define <2 x i1> @fcmp_ogt_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4630,8 +4630,8 @@ define <2 x i1> @fcmp_ogt_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4659,8 +4659,8 @@ define <2 x i1> @fcmp_oge_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4675,8 +4675,8 @@ define <2 x i1> @fcmp_oge_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4704,8 +4704,8 @@ define <2 x i1> @fcmp_olt_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4720,8 +4720,8 @@ define <2 x i1> @fcmp_olt_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4749,8 +4749,8 @@ define <2 x i1> @fcmp_ole_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4765,8 +4765,8 @@ define <2 x i1> @fcmp_ole_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4797,8 +4797,8 @@ define <2 x i1> @fcmp_one_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4816,8 +4816,8 @@ define <2 x i1> @fcmp_one_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4893,8 +4893,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4912,8 +4912,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4945,8 +4945,8 @@ define <2 x i1> @fcmp_ugt_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4962,8 +4962,8 @@ define <2 x i1> @fcmp_ugt_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4993,8 +4993,8 @@ define <2 x i1> @fcmp_uge_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5010,8 +5010,8 @@ define <2 x i1> @fcmp_uge_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5041,8 +5041,8 @@ define <2 x i1> @fcmp_ult_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5058,8 +5058,8 @@ define <2 x i1> @fcmp_ult_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5089,8 +5089,8 @@ define <2 x i1> @fcmp_ule_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5106,8 +5106,8 @@ define <2 x i1> @fcmp_ule_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5247,8 +5247,8 @@ define <4 x i1> @fcmp_ogt_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5263,8 +5263,8 @@ define <4 x i1> @fcmp_ogt_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5292,8 +5292,8 @@ define <4 x i1> @fcmp_oge_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5308,8 +5308,8 @@ define <4 x i1> @fcmp_oge_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5337,8 +5337,8 @@ define <4 x i1> @fcmp_olt_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5353,8 +5353,8 @@ define <4 x i1> @fcmp_olt_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5382,8 +5382,8 @@ define <4 x i1> @fcmp_ole_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5398,8 +5398,8 @@ define <4 x i1> @fcmp_ole_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5430,8 +5430,8 @@ define <4 x i1> @fcmp_one_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -5449,8 +5449,8 @@ define <4 x i1> @fcmp_one_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -5526,8 +5526,8 @@ define <4 x i1> @fcmp_ueq_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -5545,8 +5545,8 @@ define <4 x i1> @fcmp_ueq_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -5578,8 +5578,8 @@ define <4 x i1> @fcmp_ugt_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5595,8 +5595,8 @@ define <4 x i1> @fcmp_ugt_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5626,8 +5626,8 @@ define <4 x i1> @fcmp_uge_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5643,8 +5643,8 @@ define <4 x i1> @fcmp_uge_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5674,8 +5674,8 @@ define <4 x i1> @fcmp_ult_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5691,8 +5691,8 @@ define <4 x i1> @fcmp_ult_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5722,8 +5722,8 @@ define <4 x i1> @fcmp_ule_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5739,8 +5739,8 @@ define <4 x i1> @fcmp_ule_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -6088,9 +6088,9 @@ define <8 x i1> @fcmp_one_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -6108,9 +6108,9 @@ define <8 x i1> @fcmp_one_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -6187,9 +6187,9 @@ define <8 x i1> @fcmp_ueq_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -6207,9 +6207,9 @@ define <8 x i1> @fcmp_ueq_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -7230,8 +7230,8 @@ define <1 x i1> @fcmp_ogt_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7246,8 +7246,8 @@ define <1 x i1> @fcmp_ogt_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7275,8 +7275,8 @@ define <1 x i1> @fcmp_oge_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7291,8 +7291,8 @@ define <1 x i1> @fcmp_oge_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7320,8 +7320,8 @@ define <1 x i1> @fcmp_olt_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7336,8 +7336,8 @@ define <1 x i1> @fcmp_olt_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7365,8 +7365,8 @@ define <1 x i1> @fcmp_ole_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7381,8 +7381,8 @@ define <1 x i1> @fcmp_ole_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7413,8 +7413,8 @@ define <1 x i1> @fcmp_one_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -7432,8 +7432,8 @@ define <1 x i1> @fcmp_one_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -7509,8 +7509,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -7528,8 +7528,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -7561,8 +7561,8 @@ define <1 x i1> @fcmp_ugt_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7578,8 +7578,8 @@ define <1 x i1> @fcmp_ugt_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7609,8 +7609,8 @@ define <1 x i1> @fcmp_uge_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7626,8 +7626,8 @@ define <1 x i1> @fcmp_uge_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7657,8 +7657,8 @@ define <1 x i1> @fcmp_ult_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7674,8 +7674,8 @@ define <1 x i1> @fcmp_ult_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7705,8 +7705,8 @@ define <1 x i1> @fcmp_ule_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7722,8 +7722,8 @@ define <1 x i1> @fcmp_ule_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7863,8 +7863,8 @@ define <2 x i1> @fcmp_ogt_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7879,8 +7879,8 @@ define <2 x i1> @fcmp_ogt_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7908,8 +7908,8 @@ define <2 x i1> @fcmp_oge_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7924,8 +7924,8 @@ define <2 x i1> @fcmp_oge_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7953,8 +7953,8 @@ define <2 x i1> @fcmp_olt_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7969,8 +7969,8 @@ define <2 x i1> @fcmp_olt_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7998,8 +7998,8 @@ define <2 x i1> @fcmp_ole_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -8014,8 +8014,8 @@ define <2 x i1> @fcmp_ole_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -8046,8 +8046,8 @@ define <2 x i1> @fcmp_one_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -8065,8 +8065,8 @@ define <2 x i1> @fcmp_one_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -8142,8 +8142,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -8161,8 +8161,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -8194,8 +8194,8 @@ define <2 x i1> @fcmp_ugt_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8211,8 +8211,8 @@ define <2 x i1> @fcmp_ugt_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8242,8 +8242,8 @@ define <2 x i1> @fcmp_uge_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8259,8 +8259,8 @@ define <2 x i1> @fcmp_uge_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8290,8 +8290,8 @@ define <2 x i1> @fcmp_ult_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8307,8 +8307,8 @@ define <2 x i1> @fcmp_ult_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8338,8 +8338,8 @@ define <2 x i1> @fcmp_ule_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8355,8 +8355,8 @@ define <2 x i1> @fcmp_ule_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8704,9 +8704,9 @@ define <4 x i1> @fcmp_one_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -8724,9 +8724,9 @@ define <4 x i1> @fcmp_one_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -8803,9 +8803,9 @@ define <4 x i1> @fcmp_ueq_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -8823,9 +8823,9 @@ define <4 x i1> @fcmp_ueq_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll
index 97641ff6d92d70..472f2073667dbe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll
@@ -509,8 +509,8 @@ define <1 x i1> @fcmps_uno_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -1041,8 +1041,8 @@ define <2 x i1> @fcmps_uno_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -1573,8 +1573,8 @@ define <4 x i1> @fcmps_uno_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -2105,8 +2105,8 @@ define <8 x i1> @fcmps_uno_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -2637,10 +2637,10 @@ define <16 x i1> @fcmps_uno_vf_v16f16(<16 x half> %va, half %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfle.vf v12, v10, fa0
-; CHECK-NEXT:    vmfle.vv v10, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v10
-; CHECK-NEXT:    vmorn.mm v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v12, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v10, fa0
+; CHECK-NEXT:    vmnot.m v9, v12
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -3210,10 +3210,10 @@ define <32 x i1> @fcmps_uno_vf_v32f16(<32 x half> %va, half %b) nounwind strictf
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
-; CHECK-NEXT:    vmfle.vf v16, v12, fa0
-; CHECK-NEXT:    vmfle.vv v12, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v12
-; CHECK-NEXT:    vmorn.mm v0, v8, v16
+; CHECK-NEXT:    vmfle.vv v16, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v12, fa0
+; CHECK-NEXT:    vmnot.m v9, v16
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3743,8 +3743,8 @@ define <1 x i1> @fcmps_uno_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -4275,8 +4275,8 @@ define <2 x i1> @fcmps_uno_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -4807,8 +4807,8 @@ define <4 x i1> @fcmps_uno_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -5339,10 +5339,10 @@ define <8 x i1> @fcmps_uno_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfle.vf v12, v10, fa0
-; CHECK-NEXT:    vmfle.vv v10, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v10
-; CHECK-NEXT:    vmorn.mm v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v12, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v10, fa0
+; CHECK-NEXT:    vmnot.m v9, v12
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -5871,10 +5871,10 @@ define <16 x i1> @fcmps_uno_vf_v16f32(<16 x float> %va, float %b) nounwind stric
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
-; CHECK-NEXT:    vmfle.vf v16, v12, fa0
-; CHECK-NEXT:    vmfle.vv v12, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v12
-; CHECK-NEXT:    vmorn.mm v0, v8, v16
+; CHECK-NEXT:    vmfle.vv v16, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v12, fa0
+; CHECK-NEXT:    vmnot.m v9, v16
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6403,8 +6403,8 @@ define <1 x i1> @fcmps_uno_vf_v1f64(<1 x double> %va, double %b) nounwind strict
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -6935,8 +6935,8 @@ define <2 x i1> @fcmps_uno_vf_v2f64(<2 x double> %va, double %b) nounwind strict
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -7467,10 +7467,10 @@ define <4 x i1> @fcmps_uno_vf_v4f64(<4 x double> %va, double %b) nounwind strict
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfle.vf v12, v10, fa0
-; CHECK-NEXT:    vmfle.vv v10, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v10
-; CHECK-NEXT:    vmorn.mm v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v12, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v10, fa0
+; CHECK-NEXT:    vmnot.m v9, v12
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -7999,10 +7999,10 @@ define <8 x i1> @fcmps_uno_vf_v8f64(<8 x double> %va, double %b) nounwind strict
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
-; CHECK-NEXT:    vmfle.vf v16, v12, fa0
-; CHECK-NEXT:    vmfle.vv v12, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v12
-; CHECK-NEXT:    vmorn.mm v0, v8, v16
+; CHECK-NEXT:    vmfle.vv v16, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v12, fa0
+; CHECK-NEXT:    vmnot.m v9, v16
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index 35ce42ec841dc8..c61f9cd9b5bd77 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -66,12 +66,12 @@ define <2 x half> @vfma_vf_v2f16(<2 x half> %va, half %b, <2 x half> %vc, <2 x i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -92,12 +92,12 @@ define <2 x half> @vfma_vf_v2f16_unmasked(<2 x half> %va, half %b, <2 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -165,12 +165,12 @@ define <4 x half> @vfma_vf_v4f16(<4 x half> %va, half %b, <4 x half> %vc, <4 x i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -191,12 +191,12 @@ define <4 x half> @vfma_vf_v4f16_unmasked(<4 x half> %va, half %b, <4 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -264,14 +264,14 @@ define <8 x half> @vfma_vf_v8f16(<8 x half> %va, half %b, <8 x half> %vc, <8 x i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -290,14 +290,14 @@ define <8 x half> @vfma_vf_v8f16_unmasked(<8 x half> %va, half %b, <8 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -363,14 +363,14 @@ define <16 x half> @vfma_vf_v16f16(<16 x half> %va, half %b, <16 x half> %vc, <1
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <16 x half> poison, half %b, i32 0
   %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -389,14 +389,14 @@ define <16 x half> @vfma_vf_v16f16_unmasked(<16 x half> %va, half %b, <16 x half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <16 x half> poison, half %b, i32 0
   %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -828,23 +828,24 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v24, v8
-; CHECK-NEXT:    addi a1, a2, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    addi a1, a2, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v16, (a2)
+; CHECK-NEXT:    vle64.v v8, (a2)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a4
@@ -854,9 +855,17 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB50_2:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
@@ -870,19 +879,19 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    vmv.v.v v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
@@ -901,48 +910,60 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v0, v8
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a2, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a2)
+; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    vle64.v v16, (a2)
-; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    vle64.v v0, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a4
 ; CHECK-NEXT:    bltu a4, a1, .LBB51_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB51_2:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v0, v16
+; CHECK-NEXT:    vfmadd.vv v0, v8, v16
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v16, v0
+; CHECK-NEXT:    vfmadd.vv v24, v16, v8
+; CHECK-NEXT:    vmv8r.v v8, v0
 ; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index 18abded9ea8b93..a5d9b3439e29bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -616,23 +616,24 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v24, v8
-; CHECK-NEXT:    addi a1, a2, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    addi a1, a2, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v16, (a2)
+; CHECK-NEXT:    vle64.v v8, (a2)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a4
@@ -642,9 +643,17 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB50_2:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
@@ -658,19 +667,19 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    vmv.v.v v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
@@ -689,48 +698,60 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v0, v8
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a2, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a2)
+; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    vle64.v v16, (a2)
-; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    vle64.v v0, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a4
 ; CHECK-NEXT:    bltu a4, a1, .LBB51_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB51_2:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v0, v16
+; CHECK-NEXT:    vfmadd.vv v0, v8, v16
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v16, v0
+; CHECK-NEXT:    vfmadd.vv v24, v16, v8
+; CHECK-NEXT:    vmv8r.v v8, v0
 ; CHECK-NEXT:    vmv.v.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index 1144f776e7fbfc..aba9056c78cdaa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -118,11 +118,11 @@ define void @vfwmacc_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a
 ; FOLDING:       # %bb.0:
 ; FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; FOLDING-NEXT:    vfwmul.vv v12, v8, v9
+; FOLDING-NEXT:    vfwsub.vv v13, v9, v10
 ; FOLDING-NEXT:    vfwmacc.vv v11, v8, v10
-; FOLDING-NEXT:    vfwsub.vv v8, v9, v10
 ; FOLDING-NEXT:    vse64.v v12, (a0)
 ; FOLDING-NEXT:    vse64.v v11, (a1)
-; FOLDING-NEXT:    vse64.v v8, (a2)
+; FOLDING-NEXT:    vse64.v v13, (a2)
 ; FOLDING-NEXT:    ret
   %c = fpext <2 x float> %a to <2 x double>
   %d = fpext <2 x float> %b to <2 x double>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index 88ae643ca742e7..3ee2e883072aab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -296,13 +296,13 @@ define <32 x i8> @vpgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %
 ; RV64-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; RV64-NEXT:    vluxei64.v v10, (a0), v16, v0.t
 ; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
+; RV64-NEXT:    sltu a1, a1, a2
+; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v8
 ; RV64-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
@@ -2052,11 +2052,11 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v24, (zero), v8, v0.t
 ; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2103,11 +2103,11 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2119,10 +2119,10 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v16, v10
-; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    vsext.vf8 v24, v8
 ; RV64-NEXT:    li a3, 16
+; RV64-NEXT:    vsext.vf8 v16, v10
+; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB95_2
@@ -2161,11 +2161,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2178,10 +2178,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:    vsext.vf8 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v8
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB96_2
@@ -2221,11 +2221,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei16.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2247,11 +2247,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei16.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    addi a2, a1, -16
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a1, a1, a2
-; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v16, 16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2279,11 +2279,11 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2295,10 +2295,10 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf4 v16, v12
-; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    vsext.vf4 v24, v8
 ; RV64-NEXT:    li a3, 16
+; RV64-NEXT:    vsext.vf4 v16, v12
+; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB98_2
@@ -2337,11 +2337,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2354,10 +2354,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:    vsext.vf4 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v8
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB99_2
@@ -2397,11 +2397,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2423,11 +2423,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    addi a2, a1, -16
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a1, a1, a2
-; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v16, 16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2454,11 +2454,11 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2470,10 +2470,10 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v16, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf2 v24, v16
-; RV64-NEXT:    vsll.vi v16, v24, 3
 ; RV64-NEXT:    vsext.vf2 v24, v8
 ; RV64-NEXT:    li a3, 16
+; RV64-NEXT:    vsext.vf2 v8, v16
+; RV64-NEXT:    vsll.vi v16, v8, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB101_2
@@ -2511,11 +2511,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2528,10 +2528,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:    vsext.vf2 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf2 v16, v8
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB102_2
@@ -2570,11 +2570,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2587,10 +2587,10 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:    vzext.vf2 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vzext.vf2 v16, v8
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB103_2
@@ -2622,17 +2622,17 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV32-NEXT:    vnsrl.wi v24, v16, 0
 ; RV32-NEXT:    vnsrl.wi v16, v8, 0
 ; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    addi a3, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vslideup.vi v16, v24, 16
 ; RV32-NEXT:    vsll.vi v24, v16, 3
+; RV32-NEXT:    sltu a2, a1, a3
+; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a3, a1, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v8, v0.t
 ; RV32-NEXT:    li a2, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
index a3cd46e485c7cb..71f497e4c7be48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
@@ -374,11 +374,11 @@ define <32 x double> @vpload_v32f64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a2, a1, -16
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    sltu a1, a1, a2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a0), v0.t
@@ -402,11 +402,11 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    addi a4, a3, -16
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v8, 2
 ; CHECK-NEXT:    sltu a3, a3, a4
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v8, 2
 ; CHECK-NEXT:    addi a4, a1, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a4), v0.t
@@ -431,13 +431,13 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a1), v0.t
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    addi a2, a0, 256
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vse64.v v16, (a1)
-; CHECK-NEXT:    addi a0, a0, 256
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vse64.v v24, (a0)
+; CHECK-NEXT:    vse64.v v24, (a2)
 ; CHECK-NEXT:    ret
   %load = call <33 x double> @llvm.vp.load.v33f64.p0(ptr %ptr, <33 x i1> %m, i32 %evl)
   ret <33 x double> %load
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index ad13603ee13ec7..d691dcd5c54b6f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -1756,11 +1756,11 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (zero), v24, v0.t
 ; RV32-NEXT:    addi a0, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a0
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1826,11 +1826,11 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a2, a2, a1
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1854,15 +1854,15 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v0, (a1)
+; RV64-NEXT:    vle32.v v24, (a1)
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v16, v0, 16
+; RV64-NEXT:    vslidedown.vi v16, v24, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vsext.vf2 v0, v24
 ; RV64-NEXT:    vsext.vf2 v24, v16
 ; RV64-NEXT:    vsll.vi v16, v24, 3
-; RV64-NEXT:    vsext.vf2 v24, v0
-; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsll.vi v24, v24, 3
+; RV64-NEXT:    vsll.vi v24, v0, 3
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB84_2
 ; RV64-NEXT:  # %bb.1:
@@ -1913,11 +1913,11 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a2, a2, a1
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1929,37 +1929,37 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a4, a3, 3
-; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
-; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 16
 ; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v16, (a1)
+; RV64-NEXT:    vle32.v v24, (a1)
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf2 v0, v16
+; RV64-NEXT:    vsext.vf2 v16, v24
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v16, v16, 16
+; RV64-NEXT:    vslidedown.vi v8, v24, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf2 v24, v16
-; RV64-NEXT:    vsll.vi v16, v24, 3
-; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsll.vi v24, v0, 3
+; RV64-NEXT:    vsext.vf2 v24, v8
+; RV64-NEXT:    vsll.vi v8, v24, 3
+; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB85_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB85_2:
 ; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT:    vsoxei64.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
@@ -1967,14 +1967,14 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 3
-; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -2002,11 +2002,11 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a2, a2, a1
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2018,37 +2018,37 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a4, a3, 3
-; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
-; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 16
 ; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v16, (a1)
+; RV64-NEXT:    vle32.v v24, (a1)
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vzext.vf2 v0, v16
+; RV64-NEXT:    vzext.vf2 v16, v24
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v16, v16, 16
+; RV64-NEXT:    vslidedown.vi v8, v24, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vzext.vf2 v24, v16
-; RV64-NEXT:    vsll.vi v16, v24, 3
-; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsll.vi v24, v0, 3
+; RV64-NEXT:    vzext.vf2 v24, v8
+; RV64-NEXT:    vsll.vi v8, v24, 3
+; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB86_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB86_2:
 ; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT:    vsoxei64.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
@@ -2056,14 +2056,14 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 3
-; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
index d34292abdce0d5..8eaa5efe163cdd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
@@ -292,11 +292,11 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a2, a1, -16
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    sltu a1, a1, a2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0), v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
index 418b159c8fb98d..5975b0d0761ebb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
@@ -11,8 +11,8 @@ define <1 x i8> @vrol_vv_v1i8(<1 x i8> %a, <1 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -33,8 +33,8 @@ define <1 x i8> @vrol_vx_v1i8(<1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -58,8 +58,8 @@ define <2 x i8> @vrol_vv_v2i8(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -80,8 +80,8 @@ define <2 x i8> @vrol_vx_v2i8(<2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -105,8 +105,8 @@ define <4 x i8> @vrol_vv_v4i8(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -127,8 +127,8 @@ define <4 x i8> @vrol_vx_v4i8(<4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -152,8 +152,8 @@ define <8 x i8> @vrol_vv_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -174,8 +174,8 @@ define <8 x i8> @vrol_vx_v8i8(<8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -199,8 +199,8 @@ define <16 x i8> @vrol_vv_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -221,8 +221,8 @@ define <16 x i8> @vrol_vx_v16i8(<16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -247,8 +247,8 @@ define <32 x i8> @vrol_vv_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -271,8 +271,8 @@ define <32 x i8> @vrol_vx_v32i8(<32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -298,8 +298,8 @@ define <64 x i8> @vrol_vv_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -322,8 +322,8 @@ define <64 x i8> @vrol_vx_v64i8(<64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -348,8 +348,8 @@ define <1 x i16> @vrol_vv_v1i16(<1 x i16> %a, <1 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -370,8 +370,8 @@ define <1 x i16> @vrol_vx_v1i16(<1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -395,8 +395,8 @@ define <2 x i16> @vrol_vv_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -417,8 +417,8 @@ define <2 x i16> @vrol_vx_v2i16(<2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -442,8 +442,8 @@ define <4 x i16> @vrol_vv_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -464,8 +464,8 @@ define <4 x i16> @vrol_vx_v4i16(<4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -489,8 +489,8 @@ define <8 x i16> @vrol_vv_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -511,8 +511,8 @@ define <8 x i16> @vrol_vx_v8i16(<8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -536,8 +536,8 @@ define <16 x i16> @vrol_vv_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -558,8 +558,8 @@ define <16 x i16> @vrol_vx_v16i16(<16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -584,8 +584,8 @@ define <32 x i16> @vrol_vv_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -608,8 +608,8 @@ define <32 x i16> @vrol_vx_v32i16(<32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -634,12 +634,12 @@ define <1 x i32> @vrol_vv_v1i32(<1 x i32> %a, <1 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v1i32:
@@ -658,8 +658,8 @@ define <1 x i32> @vrol_vx_v1i32(<1 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -683,12 +683,12 @@ define <2 x i32> @vrol_vv_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v2i32:
@@ -707,8 +707,8 @@ define <2 x i32> @vrol_vx_v2i32(<2 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -732,12 +732,12 @@ define <4 x i32> @vrol_vv_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v4i32:
@@ -756,8 +756,8 @@ define <4 x i32> @vrol_vx_v4i32(<4 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -781,12 +781,12 @@ define <8 x i32> @vrol_vv_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsll.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v8i32:
@@ -805,8 +805,8 @@ define <8 x i32> @vrol_vx_v8i32(<8 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -830,12 +830,12 @@ define <16 x i32> @vrol_vv_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsll.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
+; CHECK-NEXT:    vsrl.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v16i32:
@@ -854,8 +854,8 @@ define <16 x i32> @vrol_vx_v16i32(<16 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -879,12 +879,12 @@ define <1 x i64> @vrol_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v1i64:
@@ -903,8 +903,8 @@ define <1 x i64> @vrol_vx_v1i64(<1 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -928,12 +928,12 @@ define <2 x i64> @vrol_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v2i64:
@@ -955,9 +955,9 @@ define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; RV32-NEXT:    vwsub.vx v11, v10, a0
 ; RV32-NEXT:    li a0, 63
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32-NEXT:    vand.vx v9, v9, a0
 ; RV32-NEXT:    vand.vx v10, v11, a0
 ; RV32-NEXT:    vsrl.vv v10, v8, v10
-; RV32-NEXT:    vand.vx v9, v9, a0
 ; RV32-NEXT:    vsll.vv v8, v8, v9
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
@@ -968,8 +968,8 @@ define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; RV64-NEXT:    vmv.v.x v9, a0
 ; RV64-NEXT:    li a0, 63
 ; RV64-NEXT:    vand.vx v10, v9, a0
-; RV64-NEXT:    vsll.vv v10, v8, v10
 ; RV64-NEXT:    vrsub.vi v9, v9, 0
+; RV64-NEXT:    vsll.vv v10, v8, v10
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsrl.vv v8, v8, v9
 ; RV64-NEXT:    vor.vv v8, v10, v8
@@ -993,12 +993,12 @@ define <4 x i64> @vrol_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsll.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v4i64:
@@ -1020,9 +1020,9 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; RV32-NEXT:    vwsub.vx v14, v12, a0
 ; RV32-NEXT:    li a0, 63
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT:    vand.vx v10, v10, a0
 ; RV32-NEXT:    vand.vx v12, v14, a0
 ; RV32-NEXT:    vsrl.vv v12, v8, v12
-; RV32-NEXT:    vand.vx v10, v10, a0
 ; RV32-NEXT:    vsll.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    ret
@@ -1033,8 +1033,8 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    li a0, 63
 ; RV64-NEXT:    vand.vx v12, v10, a0
-; RV64-NEXT:    vsll.vv v12, v8, v12
 ; RV64-NEXT:    vrsub.vi v10, v10, 0
+; RV64-NEXT:    vsll.vv v12, v8, v12
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsrl.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v12, v8
@@ -1058,12 +1058,12 @@ define <8 x i64> @vrol_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsll.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
+; CHECK-NEXT:    vsrl.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v8i64:
@@ -1085,9 +1085,9 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; RV32-NEXT:    vwsub.vx v20, v16, a0
 ; RV32-NEXT:    li a0, 63
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vand.vx v16, v20, a0
 ; RV32-NEXT:    vsrl.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vsll.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    ret
@@ -1098,8 +1098,8 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; RV64-NEXT:    vmv.v.x v12, a0
 ; RV64-NEXT:    li a0, 63
 ; RV64-NEXT:    vand.vx v16, v12, a0
-; RV64-NEXT:    vsll.vv v16, v8, v16
 ; RV64-NEXT:    vrsub.vi v12, v12, 0
+; RV64-NEXT:    vsll.vv v16, v8, v16
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsrl.vv v8, v8, v12
 ; RV64-NEXT:    vor.vv v8, v16, v8
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
index e4ddfeb4c4195a..68a9e217ccd1ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
@@ -12,8 +12,8 @@ define <1 x i8> @vror_vv_v1i8(<1 x i8> %a, <1 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -34,8 +34,8 @@ define <1 x i8> @vror_vx_v1i8(<1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -96,8 +96,8 @@ define <2 x i8> @vror_vv_v2i8(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -118,8 +118,8 @@ define <2 x i8> @vror_vx_v2i8(<2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -180,8 +180,8 @@ define <4 x i8> @vror_vv_v4i8(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -202,8 +202,8 @@ define <4 x i8> @vror_vx_v4i8(<4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -264,8 +264,8 @@ define <8 x i8> @vror_vv_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -286,8 +286,8 @@ define <8 x i8> @vror_vx_v8i8(<8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -348,8 +348,8 @@ define <16 x i8> @vror_vv_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -370,8 +370,8 @@ define <16 x i8> @vror_vx_v16i8(<16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -433,8 +433,8 @@ define <32 x i8> @vror_vv_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -457,8 +457,8 @@ define <32 x i8> @vror_vx_v32i8(<32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -525,8 +525,8 @@ define <64 x i8> @vror_vv_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -549,8 +549,8 @@ define <64 x i8> @vror_vx_v64i8(<64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -616,8 +616,8 @@ define <1 x i16> @vror_vv_v1i16(<1 x i16> %a, <1 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -638,8 +638,8 @@ define <1 x i16> @vror_vx_v1i16(<1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -700,8 +700,8 @@ define <2 x i16> @vror_vv_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -722,8 +722,8 @@ define <2 x i16> @vror_vx_v2i16(<2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -784,8 +784,8 @@ define <4 x i16> @vror_vv_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -806,8 +806,8 @@ define <4 x i16> @vror_vx_v4i16(<4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -868,8 +868,8 @@ define <8 x i16> @vror_vv_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -890,8 +890,8 @@ define <8 x i16> @vror_vx_v8i16(<8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -952,8 +952,8 @@ define <16 x i16> @vror_vv_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -974,8 +974,8 @@ define <16 x i16> @vror_vx_v16i16(<16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -1037,8 +1037,8 @@ define <32 x i16> @vror_vv_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -1061,8 +1061,8 @@ define <32 x i16> @vror_vx_v32i16(<32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -1128,12 +1128,12 @@ define <1 x i32> @vror_vv_v1i32(<1 x i32> %a, <1 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v1i32:
@@ -1152,8 +1152,8 @@ define <1 x i32> @vror_vx_v1i32(<1 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -1214,12 +1214,12 @@ define <2 x i32> @vror_vv_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v2i32:
@@ -1238,8 +1238,8 @@ define <2 x i32> @vror_vx_v2i32(<2 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -1300,12 +1300,12 @@ define <4 x i32> @vror_vv_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v4i32:
@@ -1324,8 +1324,8 @@ define <4 x i32> @vror_vx_v4i32(<4 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -1386,12 +1386,12 @@ define <8 x i32> @vror_vv_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vsll.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v8i32:
@@ -1410,8 +1410,8 @@ define <8 x i32> @vror_vx_v8i32(<8 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -1472,12 +1472,12 @@ define <16 x i32> @vror_vv_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-NEXT:    vsll.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v16i32:
@@ -1496,8 +1496,8 @@ define <16 x i32> @vror_vx_v16i32(<16 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -1558,12 +1558,12 @@ define <1 x i64> @vror_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v1i64:
@@ -1582,8 +1582,8 @@ define <1 x i64> @vror_vx_v1i64(<1 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -1605,12 +1605,12 @@ define <1 x i64> @vror_vi_v1i64(<1 x i64> %a) {
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.i v9, 1
-; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
+; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a0
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
@@ -1638,12 +1638,12 @@ define <1 x i64> @vror_vi_rotl_v1i64(<1 x i64> %a) {
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.i v9, 1
-; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a0
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
 ; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
@@ -1674,12 +1674,12 @@ define <2 x i64> @vror_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v2i64:
@@ -1701,9 +1701,9 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v11, v10, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vand.vx v10, v11, a0
 ; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
@@ -1714,8 +1714,8 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v9, a0
 ; CHECK-RV64-NEXT:    li a0, 63
 ; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
@@ -1741,12 +1741,12 @@ define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v9, v10, a0
-; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
+; CHECK-RV32-NEXT:    vmv.v.x v9, a0
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
+; CHECK-RV32-NEXT:    vand.vi v9, v9, 1
+; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v9
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v2i64:
@@ -1776,12 +1776,12 @@ define <2 x i64> @vror_vi_rotl_v2i64(<2 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v9, v10, a0
-; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
+; CHECK-RV32-NEXT:    vmv.v.x v9, a0
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
+; CHECK-RV32-NEXT:    vand.vi v9, v9, 1
+; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v9
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v2i64:
@@ -1810,12 +1810,12 @@ define <4 x i64> @vror_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vsll.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v4i64:
@@ -1837,9 +1837,9 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v14, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV32-NEXT:    vand.vx v12, v14, a0
 ; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
@@ -1850,8 +1850,8 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v10, a0
 ; CHECK-RV64-NEXT:    li a0, 63
 ; CHECK-RV64-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV64-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-RV64-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-RV64-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-RV64-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
@@ -1877,12 +1877,12 @@ define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v10, v12, a0
-; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vmv.v.x v10, a0
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
+; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v4i64:
@@ -1912,12 +1912,12 @@ define <4 x i64> @vror_vi_rotl_v4i64(<4 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v10, v12, a0
-; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vmv.v.x v10, a0
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
+; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v4i64:
@@ -1946,12 +1946,12 @@ define <8 x i64> @vror_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-NEXT:    vsll.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v8i64:
@@ -1973,9 +1973,9 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v20, v16, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vand.vx v16, v20, a0
 ; CHECK-RV32-NEXT:    vsll.vv v16, v8, v16
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    ret
@@ -1986,8 +1986,8 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v12, a0
 ; CHECK-RV64-NEXT:    li a0, 63
 ; CHECK-RV64-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV64-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-RV64-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-RV64-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-RV64-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
@@ -2013,12 +2013,12 @@ define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v16, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v12, v16, a0
-; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vmv.v.x v16, a0
-; CHECK-RV32-NEXT:    vand.vi v16, v16, 1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v16
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vmv.v.x v12, a0
+; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
+; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
+; CHECK-RV32-NEXT:    vsll.vv v16, v8, v16
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v8i64:
@@ -2048,12 +2048,12 @@ define <8 x i64> @vror_vi_rotl_v8i64(<8 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v16, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v12, v16, a0
-; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vmv.v.x v16, a0
-; CHECK-RV32-NEXT:    vand.vi v16, v16, 1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v16
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vmv.v.x v12, a0
+; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
+; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
+; CHECK-RV32-NEXT:    vsrl.vv v16, v8, v16
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v8i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
index 46e794b8a787ba..888fc79f0122da 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -11,13 +11,13 @@ define <8 x i7> @vsadd_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsra.vi v9, v9, 1
 ; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    li a0, 63
-; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vmin.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a0, 192
 ; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index add32b61a02628..1d8af4c46cc078 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -158,48 +158,38 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v9, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v6, v8
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    addi a4, a1, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v24, (a0)
-; CHECK-NEXT:    addi a0, a1, 128
-; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a3, -128
+; CHECK-NEXT:    vle8.v v8, (a4)
 ; CHECK-NEXT:    sltu a4, a3, a0
-; CHECK-NEXT:    vle8.v v0, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle8.v v16, (a1)
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a0, a4, a0
-; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v16, v24, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; CHECK-NEXT:    bltu a3, a2, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
index a3bba2dd8265c0..557882ee31d4cb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
@@ -9,21 +9,21 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    slli a1, a2, 30
-; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    andi a4, a2, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a4
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 29
+; RV32-NEXT:    slli a4, a2, 29
 ; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    slli a1, a2, 28
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    slli a4, a2, 27
+; RV32-NEXT:    srli a2, a2, 5
 ; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 27
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    srli a2, a2, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
 ; RV32-NEXT:    vslide1down.vx v10, v10, a2
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -39,21 +39,21 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
 ; RV64-NEXT:    slli a1, a2, 62
-; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    andi a4, a2, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a4
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 61
+; RV64-NEXT:    slli a4, a2, 61
 ; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    slli a1, a2, 60
+; RV64-NEXT:    srli a4, a4, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
+; RV64-NEXT:    slli a4, a2, 59
+; RV64-NEXT:    srli a2, a2, 5
 ; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 59
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    srli a2, a2, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
 ; RV64-NEXT:    vslide1down.vx v10, v10, a2
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -77,21 +77,21 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    slli a1, a2, 30
-; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    andi a4, a2, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a4
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 29
+; RV32-NEXT:    slli a4, a2, 29
 ; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    slli a1, a2, 28
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    slli a4, a2, 27
+; RV32-NEXT:    srli a2, a2, 5
 ; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 27
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    srli a2, a2, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
 ; RV32-NEXT:    vslide1down.vx v10, v10, a2
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -107,21 +107,21 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
 ; RV64-NEXT:    slli a1, a2, 62
-; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    andi a4, a2, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a4
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 61
+; RV64-NEXT:    slli a4, a2, 61
 ; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    slli a1, a2, 60
+; RV64-NEXT:    srli a4, a4, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
+; RV64-NEXT:    slli a4, a2, 59
+; RV64-NEXT:    srli a2, a2, 5
 ; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 59
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    srli a2, a2, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
 ; RV64-NEXT:    vslide1down.vx v10, v10, a2
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -146,21 +146,21 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    andi a3, a1, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a3
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 29
+; RV32-NEXT:    slli a3, a1, 29
 ; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    slli a0, a1, 28
+; RV32-NEXT:    srli a3, a3, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
+; RV32-NEXT:    slli a3, a1, 27
+; RV32-NEXT:    srli a1, a1, 5
 ; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 27
-; RV32-NEXT:    srli a0, a0, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    srli a1, a1, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -176,21 +176,21 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
 ; RV64-NEXT:    slli a0, a1, 62
-; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    andi a3, a1, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a3
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 61
+; RV64-NEXT:    slli a3, a1, 61
 ; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    slli a0, a1, 60
+; RV64-NEXT:    srli a3, a3, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
+; RV64-NEXT:    slli a3, a1, 59
+; RV64-NEXT:    srli a1, a1, 5
 ; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 59
-; RV64-NEXT:    srli a0, a0, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    srli a1, a1, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -214,21 +214,21 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    slli a1, a2, 30
-; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    andi a4, a2, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a4
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 29
+; RV32-NEXT:    slli a4, a2, 29
 ; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    slli a1, a2, 28
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    slli a4, a2, 27
+; RV32-NEXT:    srli a2, a2, 5
 ; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 27
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    srli a2, a2, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
 ; RV32-NEXT:    vslide1down.vx v10, v10, a2
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -244,21 +244,21 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
 ; RV64-NEXT:    slli a1, a2, 62
-; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    andi a4, a2, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a4
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 61
+; RV64-NEXT:    slli a4, a2, 61
 ; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    slli a1, a2, 60
+; RV64-NEXT:    srli a4, a4, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
+; RV64-NEXT:    slli a4, a2, 59
+; RV64-NEXT:    srli a2, a2, 5
 ; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 59
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    srli a2, a2, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
 ; RV64-NEXT:    vslide1down.vx v10, v10, a2
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -282,21 +282,21 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    andi a3, a1, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a3
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 29
+; RV32-NEXT:    slli a3, a1, 29
 ; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    slli a0, a1, 28
+; RV32-NEXT:    srli a3, a3, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
+; RV32-NEXT:    slli a3, a1, 27
+; RV32-NEXT:    srli a1, a1, 5
 ; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 27
-; RV32-NEXT:    srli a0, a0, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    srli a1, a1, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -312,21 +312,21 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
 ; RV64-NEXT:    slli a0, a1, 62
-; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    andi a3, a1, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a3
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 61
+; RV64-NEXT:    slli a3, a1, 61
 ; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    slli a0, a1, 60
+; RV64-NEXT:    srli a3, a3, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
+; RV64-NEXT:    slli a3, a1, 59
+; RV64-NEXT:    srli a1, a1, 5
 ; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 59
-; RV64-NEXT:    srli a0, a0, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    srli a1, a1, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -351,21 +351,21 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    andi a3, a1, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a3
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 29
+; RV32-NEXT:    slli a3, a1, 29
 ; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    slli a0, a1, 28
+; RV32-NEXT:    srli a3, a3, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
+; RV32-NEXT:    slli a3, a1, 27
+; RV32-NEXT:    srli a1, a1, 5
 ; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 27
-; RV32-NEXT:    srli a0, a0, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    srli a1, a1, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -381,21 +381,21 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
 ; RV64-NEXT:    slli a0, a1, 62
-; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    andi a3, a1, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a3
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 61
+; RV64-NEXT:    slli a3, a1, 61
 ; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    slli a0, a1, 60
+; RV64-NEXT:    srli a3, a3, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
+; RV64-NEXT:    slli a3, a1, 59
+; RV64-NEXT:    srli a1, a1, 5
 ; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 59
-; RV64-NEXT:    srli a0, a0, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    srli a1, a1, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
index ddf98bab781413..8fad3db55f9bcd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -11,13 +11,13 @@ define <8 x i7> @vssub_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsra.vi v9, v9, 1
 ; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    li a0, 63
-; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vmin.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a0, 192
 ; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
@@ -386,12 +386,12 @@ define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    addi a3, a1, -128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a3, a3, a0
+; CHECK-NEXT:    sltu a0, a1, a3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a3, a0, a3
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
index 0728bcf0fda584..ca35aa6c4a94c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -381,12 +381,12 @@ define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    addi a3, a1, -128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a3, a3, a0
+; CHECK-NEXT:    sltu a0, a1, a3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a3, a0, a3
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index 808962e0344c6e..e6dfe5e78cdb4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x bfloat> @vp_floor_nxv1bf16(<vscale x 1 x bfloat> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -44,10 +44,10 @@ define <vscale x 1 x bfloat> @vp_floor_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -69,10 +69,10 @@ define <vscale x 2 x bfloat> @vp_floor_nxv2bf16(<vscale x 2 x bfloat> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -94,10 +94,10 @@ define <vscale x 2 x bfloat> @vp_floor_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -120,10 +120,10 @@ define <vscale x 4 x bfloat> @vp_floor_nxv4bf16(<vscale x 4 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -146,10 +146,10 @@ define <vscale x 4 x bfloat> @vp_floor_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -172,10 +172,10 @@ define <vscale x 8 x bfloat> @vp_floor_nxv8bf16(<vscale x 8 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -198,10 +198,10 @@ define <vscale x 8 x bfloat> @vp_floor_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -224,10 +224,10 @@ define <vscale x 16 x bfloat> @vp_floor_nxv16bf16(<vscale x 16 x bfloat> %va, <v
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -250,10 +250,10 @@ define <vscale x 16 x bfloat> @vp_floor_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -279,62 +279,54 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v17, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 2
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vmv1r.v v8, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -354,51 +346,41 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 2
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
@@ -410,12 +392,6 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.floor.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -444,10 +420,10 @@ define <vscale x 1 x half> @vp_floor_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -484,10 +460,10 @@ define <vscale x 1 x half> @vp_floor_nxv1f16_unmasked(<vscale x 1 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -526,10 +502,10 @@ define <vscale x 2 x half> @vp_floor_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -566,10 +542,10 @@ define <vscale x 2 x half> @vp_floor_nxv2f16_unmasked(<vscale x 2 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -609,10 +585,10 @@ define <vscale x 4 x half> @vp_floor_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -650,10 +626,10 @@ define <vscale x 4 x half> @vp_floor_nxv4f16_unmasked(<vscale x 4 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -695,10 +671,10 @@ define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -736,10 +712,10 @@ define <vscale x 8 x half> @vp_floor_nxv8f16_unmasked(<vscale x 8 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -781,10 +757,10 @@ define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -822,10 +798,10 @@ define <vscale x 16 x half> @vp_floor_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -870,62 +846,54 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v8, v16
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -960,51 +928,41 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
 ;
 ; ZVFHMIN-LABEL: vp_floor_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -1016,12 +974,6 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.floor.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1475,12 +1427,12 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI44_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a3)
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
@@ -1501,23 +1453,26 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -1533,12 +1488,12 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-LABEL: vp_floor_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI45_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a3)
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI45_0)
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index c2c2beda94a0be..734dd5e33c4fcb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -144,55 +144,155 @@ define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
 declare <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
 
 define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; CHECK-LABEL: vfmax_nxv32bf16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v0, v8
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v3, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v3
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+; ZVFH-LABEL: vfmax_nxv32bf16_vv:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    addi sp, sp, -16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    sub sp, sp, a0
+; ZVFH-NEXT:    vmv8r.v v24, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv8r.v v0, v8
+; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv1r.v v0, v3
+; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vfmax.vv v16, v0, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFH-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFH-NEXT:    vfmax.vv v16, v8, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    add sp, sp, a0
+; ZVFH-NEXT:    addi sp, sp, 16
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vfmax_nxv32bf16_vv:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmax.vv v16, v0, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
   ret <vscale x 32 x bfloat> %v
 }
@@ -395,40 +495,62 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v24, v24
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT:    vfmax.vv v8, v8, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v0, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v16, v0
-; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -436,7 +558,8 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
@@ -632,21 +755,19 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v10, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v10, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v11, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -669,14 +790,12 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index f85be11fc60f8d..e90d3e3f3e8295 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -45,11 +45,9 @@ define <vscale x 1 x bfloat> @vfmax_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v11, v11
 ; CHECK-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -93,11 +91,9 @@ define <vscale x 2 x bfloat> @vfmax_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v11, v11
 ; CHECK-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; CHECK-NEXT:    vmv.v.v v0, v8
@@ -143,11 +139,9 @@ define <vscale x 4 x bfloat> @vfmax_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -193,11 +187,9 @@ define <vscale x 8 x bfloat> @vfmax_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v8, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -264,11 +256,9 @@ define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -302,58 +292,64 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    vmv8r.v v0, v8
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a4, a4, 5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v4
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; CHECK-NEXT:    vmv8r.v v0, v16
-; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmfeq.vv v12, v16, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
@@ -371,32 +367,27 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v24, v16, v16, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
@@ -451,89 +442,114 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v7, v24, a2
+; CHECK-NEXT:    vslidedown.vx v12, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v13, v24, v24, v0.t
+; CHECK-NEXT:    vmv8r.v v0, v16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    li a4, 24
+; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v4, v16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v3, v16, v16
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v3
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v0, v16
-; CHECK-NEXT:    vmv8r.v v8, v0
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add sp, sp, a0
@@ -599,11 +615,9 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -671,11 +685,9 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
@@ -745,11 +757,9 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -821,11 +831,9 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -918,11 +926,9 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
@@ -986,58 +992,64 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v24, v0
-; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a4
+; ZVFHMIN-NEXT:    mul a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -1055,32 +1067,27 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a2, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
@@ -1146,89 +1153,114 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v7, v24, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    li a4, 24
+; ZVFHMIN-NEXT:    mul a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v8, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a4
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v24, v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a2, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v0, v16
-; ZVFHMIN-NEXT:    vmv8r.v v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -1545,61 +1577,55 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    srli a4, a1, 3
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a4
+; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a3
-; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v26, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v24, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    slli a3, a3, 3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vmfeq.vv v16, v24, v24, v0.t
-; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
+; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vmv1r.v v0, v26
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v16, v24, v8, v0
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vfmax.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -1609,19 +1635,13 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:  .LBB40_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
@@ -1633,12 +1653,13 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1665,12 +1686,12 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index 928171e1f9528b..21251ee2f3c630 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -144,55 +144,155 @@ define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
 declare <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
 
 define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; CHECK-LABEL: vfmin_nxv32bf16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v0, v8
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v3, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v3
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+; ZVFH-LABEL: vfmin_nxv32bf16_vv:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    addi sp, sp, -16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    sub sp, sp, a0
+; ZVFH-NEXT:    vmv8r.v v24, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv8r.v v0, v8
+; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv1r.v v0, v3
+; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vfmin.vv v16, v0, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFH-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFH-NEXT:    vfmin.vv v16, v8, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    add sp, sp, a0
+; ZVFH-NEXT:    addi sp, sp, 16
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vfmin_nxv32bf16_vv:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmin.vv v16, v0, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
   ret <vscale x 32 x bfloat> %v
 }
@@ -395,40 +495,62 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v24, v24
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT:    vfmin.vv v8, v8, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v0, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v16, v0
-; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -436,7 +558,8 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
@@ -632,21 +755,19 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v10, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v10, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v11, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -669,14 +790,12 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index 0f9ab5985c9dbc..818a90607ea073 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -45,11 +45,9 @@ define <vscale x 1 x bfloat> @vfmin_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v11, v11
 ; CHECK-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -93,11 +91,9 @@ define <vscale x 2 x bfloat> @vfmin_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v11, v11
 ; CHECK-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; CHECK-NEXT:    vmv.v.v v0, v8
@@ -143,11 +139,9 @@ define <vscale x 4 x bfloat> @vfmin_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -193,11 +187,9 @@ define <vscale x 8 x bfloat> @vfmin_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v8, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -264,11 +256,9 @@ define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -302,58 +292,64 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    vmv8r.v v0, v8
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a4, a4, 5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v4
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; CHECK-NEXT:    vmv8r.v v0, v16
-; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmfeq.vv v12, v16, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
@@ -371,32 +367,27 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v24, v16, v16, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
@@ -451,89 +442,114 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CH